-
Notifications
You must be signed in to change notification settings - Fork 202
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Auto generate unicode property tests.
- Loading branch information
Zoltan Herczeg
committed
Dec 30, 2021
1 parent
6614b28
commit 05bfaef
Showing
4 changed files
with
6,420 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This file auto-generates unicode property tests and their expected output.
# It is recommended to re-run this generator after the unicode files are
# updated. The names of the generated files are `testinput26` and `testoutput26`.
|
||
import re | ||
import sys | ||
|
||
from GenerateCommon import \ | ||
script_names, \ | ||
script_abbrevs | ||
|
||
def write_both(text):
    """Write text to both generated files.

    Used for everything that is identical in the test input and in the
    expected output (comments, patterns, subject lines, blank separators).
    Relies on the module-level input_file and output_file objects being
    open for writing.
    """
    input_file.write(text)
    output_file.write(text)
|
||
def to_string_char(ch_idx):
    """Return the text used for code point ch_idx on a generated subject line.

    Code points 32..127 are returned as the literal character (note that
    this includes 127, which is emitted raw). Everything else is returned
    as a \\x{...} hexadecimal escape; values below 16 get a leading zero so
    the escape always has at least two hex digits.
    """
    if 32 <= ch_idx < 128:
        return chr(ch_idx)
    if ch_idx < 16:
        # Pad single-digit values: 0xa -> \x{0a}
        return "\\x{0%x}" % ch_idx
    return "\\x{%x}" % ch_idx
|
||
# Optional single command-line argument: the directory in which the two
# generated files are created. Without it they go to the current directory.
output_directory = ""

if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)
if len(sys.argv) == 2:
    output_directory = sys.argv[1]
    # The file names are appended by plain concatenation below, so make sure
    # the directory ends with a separator.
    if not output_directory.endswith("/"):
        output_directory += "/"

try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
|
||
def gen_script_tests():
    """Generate the script and script-extension property tests.

    Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
    (paths are relative to the current working directory) and, for every
    entry of script_names except "Unknown", writes test cases via
    write_both() and output_file.

    One five-element record is kept per script, indexed like script_names:
        [0]  first code point of the first Scripts.txt line for the script
        [1]  last code point of the last Scripts.txt line for the script
        [2]  lowest code point on a ScriptExtensions.txt line naming it
        [3]  highest code point on a ScriptExtensions.txt line naming it
        [4]  first code point found on such a line whose Scripts.txt
             script is different (an "extension only" character)
    Fields [2]..[4] remain None for scripts that never appear in
    ScriptExtensions.txt; [4] also remains None if no such character exists.
    """
    script_data = [None] * len(script_names)
    # Script name from Scripts.txt for every code point (None if unlisted).
    char_data = [None] * 0x110000

    # Matches "XXXX ; Name #" and "XXXX..YYYY ; Name #" data lines; group 3
    # may hold several space-separated abbreviations in ScriptExtensions.txt.
    property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            name = match_obj.group(3)
            # Consecutive lines usually belong to the same script, so only
            # look the index up again when the name changes.
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for ch in range(low, high + 1):
                char_data[ch] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    # Abbreviation -> index into script_names / script_abbrevs, filled lazily.
    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indices:
                    ext_idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = ext_idx
                    rec = script_data[ext_idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    ext_idx = extended_script_indices[abbrev]
                    rec = script_data[ext_idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    # Remember the first character of this range that carries
                    # the script only through its extension list.
                    name = script_names[ext_idx]
                    for ch in range(low, high + 1):
                        if char_data[ch] != name:
                            rec[4] = ch
                            break

    def emit_case(prop, ch_idx, matches):
        # One test case: pattern, subject line, expected result, blank line.
        subject = to_string_char(ch_idx)
        write_both("/^\\p{%s}/utf\n" % prop)
        write_both(" %s\n" % subject)
        if matches:
            output_file.write(" 0: %s\n" % subject)
        else:
            output_file.write("No match\n")
        write_both("\n")

    # Alternate between the short and long spelling of the extension
    # property so that both get exercised.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        emit_case("sc=%s" % script_name, rec[0], True)
        emit_case("Script=%s" % script_abbrev, rec[1], True)

        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            emit_case(script_name, rec[2], True)
            emit_case("%s=%s" % (property_name, script_abbrev), rec[3], True)

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                emit_case(script_name, rec[4], True)
                emit_case("sc=%s" % script_name, rec[4], False)
            else:
                print("External character has not found for %s" % script_name)

        # The code point just past everything seen for this script (base
        # range or extension range, whichever reaches further).
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        emit_case(script_name, high + 1, False)
|
||
|
||
gen_script_tests()

write_both("# End of testinput26\n")

# Close explicitly so that all buffered data reaches disk; relying on
# interpreter shutdown to flush open files is implementation-dependent.
input_file.close()
output_file.close()
Oops, something went wrong.