Skip to content

Commit

Permalink
Refactored the data file generation process
Browse files Browse the repository at this point in the history
  • Loading branch information
zensoup committed May 17, 2019
1 parent f607e06 commit 3768d39
Showing 1 changed file with 80 additions and 29 deletions.
109 changes: 80 additions & 29 deletions generate_character_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,48 +2,99 @@
Download the latest unicode tables from https://www.unicode.org and create a .txt file
containing all the names, blocks and character codes
"""
import os
import logging
from urllib import request

# Directory portion of this script's path.
curr_path = os.path.dirname(__file__)
# Configure the root logger so that records of level DEBUG and above are emitted.
logging.basicConfig(level=logging.DEBUG)

def main():

def get_blocks():
    """Download the Unicode block definition file (``Blocks.txt``).

    Returns:
        The full text of the file, decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the download fails.
    """
    logging.info("Downloading block data...")
    url = "https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"
    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding raises.
    with request.urlopen(url) as response:
        content = response.read().decode("utf-8")
    logging.info("Done")
    return content


def get_data():
    """Download the Unicode character data file (``UnicodeData.txt``).

    Returns:
        The full text of the file, decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the download fails.
    """
    # The message previously said "block data", copied from get_blocks().
    logging.info("Downloading character data...")
    url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding raises.
    with request.urlopen(url) as response:
        content = response.read().decode("utf-8")
    logging.info("Done")
    return content


def clean(text):
    """Remove all blank or commented lines from a string.

    Each line is stripped of surrounding whitespace first, so a ``#`` comment
    that is preceded by indentation is dropped as well.

    Args:
        text: Multi-line string to filter.

    Returns:
        The remaining stripped lines joined with ``"\\n"``; an empty string
        when nothing is left.
    """
    stripped_lines = (line.strip() for line in text.split("\n"))
    return "\n".join(
        line for line in stripped_lines if line and not line.startswith("#")
    )


def load_blocks():
    """Load and parse the block data and return a block lookup function.

    The block list is downloaded with ``get_blocks()`` and filtered with
    ``clean()``. Each remaining line is expected to look like
    ``"0000..007F; Basic Latin"``.

    Returns:
        A function that takes an integer character code and returns the name
        of the block containing it, or ``None`` if no block matches.

    Raises:
        ValueError: If a line does not have the ``start..stop; name`` shape or
            the range bounds are not hexadecimal numbers.
    """
    indices = []
    blocks = []
    block_data = clean(get_blocks())
    for line in block_data.split("\n"):
        code_range, name = line.split(";")
        start, stop = code_range.split("..")
        indices.append((int(start, 16), int(stop, 16)))
        blocks.append(name.strip())

    def locate_block(code):
        """Return the name of the first block whose range contains *code*."""
        for (start, stop), block_name in zip(indices, blocks):
            if start <= code <= stop:
                return block_name
        return None

    return locate_block


def main():
    """Generate ``unicode_list.txt`` from the downloaded Unicode tables.

    Each output line holds four tab-separated fields: the character name
    (field 1 of ``UnicodeData.txt``), field 10 (called ``comment`` here;
    UAX #44 lists it as Unicode_1_Name -- confirm), the hexadecimal character
    code (field 0) and the name of the block containing the code. The block
    field is left empty when no block matches.

    Lines with fewer than 11 fields or a non-hexadecimal code are skipped
    with a warning instead of aborting the run.
    """
    get_block = load_blocks()
    characters = clean(get_data())

    logging.info("Parsing character data...")

    output = []
    for line in characters.split("\n"):
        # Parse the needed data
        attributes = line.strip().split(";")
        if len(attributes) < 11:
            # Covers empty input and truncated lines, which would otherwise
            # raise IndexError below.
            logging.warning("Skipping malformed line: %s", line)
            continue
        code, name, comment = attributes[0], attributes[1], attributes[10]

        # Convert character code to a number
        try:
            num = int(code, 16)
        except ValueError:
            logging.warning("Could not convert %s", code)
            continue

        # Find the character's block
        blk = get_block(num)
        if blk is None:
            # chr() replaces the Python 2 only unichr().
            logging.warning(
                "Code %s not found in any block, char: %s", num, chr(num)
            )
            blk = ""
        output.append("\t".join((name, comment, code, blk)))

    # Explicit encoding keeps the output independent of the platform locale.
    with open("unicode_list.txt", "w", encoding="utf-8") as target:
        target.write("\n".join(output))


if __name__ == "__main__":
Expand Down

0 comments on commit 3768d39

Please sign in to comment.