-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathgenerate_cellwidth_table.py
151 lines (121 loc) · 5.39 KB
/
generate_cellwidth_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import requests
# Locations of the Unicode Character Database documents that the parsers below download.
categories_url = (
    "https://www.unicode.org/Public/UCD/latest/ucd/"
    "extracted/DerivedGeneralCategory.txt"
)
east_asian_url = (
    "https://www.unicode.org/Public/UCD/latest/ucd/"
    "EastAsianWidth.txt"
)
def parse_categories():
    """Download and parse the latest Unicode general category document.

    Returns:
        A list of ``(low, high, width, description)`` tuples, one for each range
        whose category is Enclosing Mark (Me), Non-Spacing Mark (Mn) or Control
        Code (Cc). All of these are recorded with a width of zero.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # example lines:
    # 0591..05BD ; Mn # [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
    # 05BF ; Mn # HEBREW POINT RAFE
    # A timeout keeps the generator from hanging forever on a stalled connection.
    response = requests.get(categories_url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    categories = ("Me", "Mn", "Cc")
    ranges = []
    for line in response.text.splitlines():
        stripped = line.strip()
        # Skip blank (including whitespace-only) lines and comment lines.
        if not stripped or stripped.startswith("#"):
            continue
        # Fields: <points> ; <category> # <description>
        points, _, category, _, desc = stripped.split(maxsplit=4)
        if category in categories:
            low, high = parse_points(points)
            ranges.append((low, high, 0, parse_desc(desc)))
    return ranges
def parse_east_asian():
    """Download and parse the latest Unicode East Asian Width document.

    Note that the document covers all codepoints, not just East Asian characters.

    Returns:
        A list of ``(low, high, width, description)`` tuples: one for each range whose
        width property is `F` (Fullwidth) or `W` (Wide), followed by the hand-picked
        emoji blocks in ``override_ranges``. Every returned range has a width of two cells.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # example lines
    # 2322..2328;N # So [7] FROWN..KEYBOARD
    # 2329;W # Ps LEFT-POINTING ANGLE BRACKET
    # A timeout keeps the generator from hanging forever on a stalled connection.
    response = requests.get(east_asian_url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    properties = ("F", "W")
    ranges = []

    # All glyphs in the following blocks have an emoji representation (see
    # https://en.wikipedia.org/wiki/Emoji#Emoji_versus_text_presentation), but EastAsianWidth.txt
    # lists a number of codepoints in them with width 'N' or 'A', often in ways that seem arbitrary
    # but probably have some historical explanation.
    #
    # For example, U+1F004 🀄 MAHJONG TILE RED DRAGON is listed as 'W',
    # but U+1F005 🀅 MAHJONG TILE GREEN DRAGON is listed as 'N'.
    #
    # Since most modern terminals display all of these codepoints as emoji, we list them as 2 cells
    # wide. This list could be refined further; for example, the Miscellaneous Symbols block
    # includes both single and double width characters, but not all of the double width characters
    # are listed as such.
    override_ranges = [
        (0x1F000, 0x1F02F, 2, "Mahjong Tiles"),
        (0x1F0A0, 0x1F0FF, 2, "Playing Cards"),
        (0x1F300, 0x1F5FF, 2, "Miscellaneous Symbols and Pictographs"),
        (0x1F600, 0x1F64F, 2, "Emoticons"),
        (0x1F680, 0x1F6FF, 2, "Transport and Map Symbols"),
        (0x1F900, 0x1F9FF, 2, "Supplemental Symbols and Pictographs"),
        (0x1FA70, 0x1FAFF, 2, "Symbols and Pictographs Extended-A"),
    ]

    for line in response.text.splitlines():
        stripped = line.strip()
        # Skip blank (including whitespace-only) lines and comment lines.
        if not stripped or stripped.startswith("#"):
            continue

        # Split "<points>;<prop> # <category> [<count>] <names>" on its delimiters rather
        # than on whitespace, so both the compact `2329;W` form and a space-padded
        # `2329 ; W` form are accepted.
        data, _, comment = stripped.partition("#")
        points, _, prop = data.partition(";")
        if prop.strip() not in properties:
            continue

        low, high = parse_points(points.strip())
        # Ranges starting inside an override block are replaced by the block entry.
        # NOTE(review): only the start of the range is tested; this assumes no F/W range
        # begins before an override block and extends into it -- confirm against the data.
        if any(it[0] <= low <= it[1] for it in override_ranges):
            continue

        # The comment begins with the general category (e.g. `So`); the rest is the description.
        comment_fields = comment.split(maxsplit=1)
        desc = comment_fields[1] if len(comment_fields) > 1 else ""
        ranges.append((low, high, 2, parse_desc(desc)))

    return ranges + override_ranges
def parse_cf():
    """Return the hand-curated table of additional zero-width codepoints.

    These codepoints come from categories that contain both zero-width and
    non-zero-width characters. The unicode document doesn't provide enough
    info to parse them, so they are listed here by hand.

    Each entry is a (low, high, width, description) tuple with a width of 0.
    """
    zero_width = 0
    curated = (
        (0x034F, 0x034F, "COMBINING GRAPHEME JOINER"),
        (0x200B, 0x200F, "ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK"),
        (0x2028, 0x202E, "LINE SEPARATOR..RIGHT-TO-LEFT OVERRIDE"),
        (0x2060, 0x2063, "WORD JOINER..INVISIBLE SEPARATOR"),
    )
    return [(first, last, zero_width, label) for first, last, label in curated]
def parse_desc(desc):
    """Strip a leading ``[count]`` prefix from a range description.

    Descriptions that do not start with ``[`` are returned unchanged. Otherwise
    everything up to and including the first ``]`` and the single character that
    follows it is dropped.
    """
    if not desc.startswith("["):
        return desc
    closing = desc.index("]")
    # Skip the bracket itself plus the separator character after it.
    return desc[closing + 2 :]
def parse_points(points):
    """Convert a hexadecimal codepoint field into a ``(low, high)`` pair of ints.

    The field is either a single codepoint such as ``05BF`` or an inclusive range
    written as ``0591..05BD``. A single codepoint yields the same value for both
    ends of the pair.
    """
    first, separator, last = points.partition("..")
    if not separator:
        value = int(first, 16)
        return value, value
    return int(first, 16), int(last, 16)
def parse_all():
    """Combine every width table into one list ordered by starting codepoint.

    The results of `parse_categories`, `parse_east_asian` and `parse_cf` are
    concatenated and sorted by their low codepoint. Consecutive ranges that share
    the same width and are adjacent, overlapping, or contained in one another are
    folded into a single range.

    Returns:
        A list of ``(low, high, width, description)`` tuples. The description of a
        merged range is ``"<first name>..<last name>"``.
    """
    combined = sorted(
        parse_categories() + parse_east_asian() + parse_cf(), key=lambda it: it[0]
    )

    ranges = []
    for low, high, width, desc in combined:
        if ranges:
            prev_low, prev_high, prev_width, prev_desc = ranges[-1]
            # `low <= prev_high + 1` covers adjacent ranges as well as ranges that
            # overlap or sit entirely inside the previous one (for instance a
            # hand-curated entry that a parsed range already covers).
            if width == prev_width and low <= prev_high + 1:
                if high > prev_high:
                    # Keep the first name of the running range and the last name of
                    # the range that extends it.
                    first_name = prev_desc.partition("..")[0]
                    last_name = desc.rpartition("..")[2]
                    ranges[-1] = (prev_low, high, width, f"{first_name}..{last_name}")
                continue
        ranges.append((low, high, width, desc))
    return ranges
def main():
    """Print the Kotlin source of the cell width table to standard output.

    The output is a package declaration, the `CellWidthTableEntry` class, and an
    `arrayOf` initializer holding one entry per range returned by `parse_all`,
    each followed by its description as a trailing comment.
    """
    header = """package com.github.ajalt.mordant.internal.gen
internal class CellWidthTableEntry(val low: Int, val high: Int, val width: Byte)
internal val CELL_WIDTH_TABLE: Array<CellWidthTableEntry> = arrayOf<CellWidthTableEntry>("""
    print(header)

    for entry in parse_all():
        low, high, width, desc = entry
        row = f" CellWidthTableEntry({hex(low)}, {hex(high)}, {width}), // {desc}"
        print(row)

    # Close the arrayOf(...) call opened by the header.
    print(")")
# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()