-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild-db.py
64 lines (55 loc) · 1.17 KB
/
build-db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re
import sqlite3
import csv
import os
import io
import urllib3
# Two independent, initially empty module-level lists. Nothing else in the
# visible part of this file appends to or reads from them.
currency, arrows = [], []
def normalize(x):
    """Return *x* lower-cased, with each run of non-alphanumerics made a space.

    Every maximal run of characters outside ``a-z``, ``A-Z`` and ``0-9`` is
    replaced by a single space. Leading and trailing runs are not stripped,
    so the result may start or end with a space.
    """
    spaced = re.sub("[^a-zA-Z0-9]+", " ", x)
    return spaced.lower()
# Build utf8.db: a SQLite database with one table, `chars`, holding a
# hyphenated lower-case name, the code point and an optional coarse category
# for every named entry in the draft UnicodeData.txt.

# Start from a clean slate. Only a missing file is an expected condition; any
# other failure (e.g. a permission error) would make CREATE TABLE below fail
# anyway, so let it surface here instead of being silently swallowed.
try:
    os.remove("utf8.db")
except FileNotFoundError:
    pass

http = urllib3.PoolManager()
db = sqlite3.connect("utf8.db")
try:
    # Columns are declared TEXT rather than STRING: SQLite gives a column
    # declared "STRING" NUMERIC affinity, which is not what is wanted for
    # textual data.
    db.execute(
        """
        CREATE TABLE chars (
            name TEXT PRIMARY KEY,
            ordinal INTEGER UNIQUE,
            category TEXT DEFAULT ''
        );
        """
    )

    # Stream the body instead of preloading it so it can be parsed row by row.
    unicodedata = http.request(
        "GET",
        "https://www.unicode.org/Public/draft/UCD/ucd/UnicodeData.txt",
        preload_content=False,
    )
    try:
        # Fail loudly on an error response rather than parsing an error page
        # as if it were character data.
        if unicodedata.status != 200:
            raise RuntimeError(
                "UnicodeData.txt download failed with HTTP status "
                f"{unicodedata.status}"
            )

        # Keep the response open after its body is exhausted so the
        # TextIOWrapper below does not end up reading from a closed stream.
        unicodedata.auto_close = False
        reader = csv.reader(
            io.TextIOWrapper(unicodedata, encoding="utf-8"), delimiter=";"
        )

        # "arrow" must start a hyphen-separated word; a plain substring test
        # would also tag names such as "narrow-no-break-space" as arrows.
        arrow_word = re.compile(r"(?:^|-)arrow")

        for row in reader:
            if not row:  # csv yields [] for a blank line
                continue
            ordinal = int(row[0], 16)
            name = row[1].lower().replace(" ", "-")
            # Skip placeholder names written in angle brackets: "<control>"
            # and the "<..., First>" / "<..., Last>" range markers.
            if "<" in name:
                continue
            # NOTE(review): passing an explicit None stores NULL and bypasses
            # the column's DEFAULT ''. Kept as-is because consumers of the
            # database may rely on NULL -- confirm which one is intended.
            category = None
            group = row[2]
            if group == "Sc":
                category = "currency"
            # Checked last, so an arrow match overrides "currency".
            if arrow_word.search(name):
                category = "arrows"
            db.execute(
                """
                INSERT INTO chars (
                    name, ordinal, category
                ) VALUES (
                    ?, ?, ?
                )
                """,
                (name, ordinal, category),
            )
    finally:
        # Hand the streamed connection back to the pool even on failure.
        unicodedata.release_conn()

    db.commit()
finally:
    db.close()