-
Notifications
You must be signed in to change notification settings - Fork 202
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Auto generate unicode property tests.
- Loading branch information
Zoltan Herczeg
committed
Dec 30, 2021
1 parent
6614b28
commit 14d338e
Showing
4 changed files
with
2,064 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#! /usr/bin/python | ||
|
||
# PCRE2 UNICODE PROPERTY SUPPORT | ||
# ------------------------------ | ||
# | ||
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator whenever the Unicode data files
# are updated. The generated files are named `testinput26` and `testoutput26`.
|
||
import re | ||
import sys | ||
|
||
from GenerateCommon import \ | ||
script_names, \ | ||
script_abbrevs | ||
|
||
def write_both(text):
    """Write *text* to the test input file and to the expected-output file.

    The input file is written first, then the output file. Both are the
    module-level file objects opened by this script.
    """
    for stream in (input_file, output_file):
        stream.write(text)
|
||
# Directory prefix for the generated files. It stays empty (current
# directory) unless exactly one command line argument is given.
output_directory = ""

argument_count = len(sys.argv)

if argument_count > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)

if argument_count == 2:
    output_directory = sys.argv[1]
    # The file names are appended by plain string concatenation below, so
    # make sure the prefix ends with a path separator.
    if not output_directory.endswith("/"):
        output_directory += "/"
|
||
# Create (or truncate) both generated files; give up with exit status 1 if
# either of them cannot be opened for writing.
try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

# Both files start with the same "generated file" notice.
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
|
||
# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")


def _write_pattern_test(pattern, low, high, should_match):
    """Write one pattern followed by its subject lines and expected results.

    `pattern` is written to both files. The subject line for `low` (and for
    `high` when it differs from `low`) is written to both files as well, and
    each subject is followed, in the output file only, by either the matched
    text or "No match" depending on `should_match`. A blank line written to
    both files terminates the test.
    """
    write_both(pattern)

    code_points = [low]
    if low != high:
        code_points.append(high)

    for code_point in code_points:
        write_both(" \\x{%x}\n" % code_point)
        if should_match:
            output_file.write(" 0: \\x{%x}\n" % code_point)
        else:
            output_file.write("No match\n")

    write_both("\n")


def gen_script_extension_tests(tables_path="Unicode.tables/ScriptExtensions.txt"):
    """Generate the script extension tests from the Unicode data file.

    Every line of the data file that matches the range pattern supplies a
    code point (or a `low..high` range) together with a space separated list
    of script abbreviations. For each abbreviation the lowest and the highest
    code point seen over all of its lines are recorded, keeping the scripts
    in order of first appearance.

    For each recorded script up to three tests are written, each of them
    checking the lowest and (if different) the highest code point:

      * `\\p{<script name>}` - expected to match
      * `\\p{scx=<abbrev>}` or `\\p{Script_Extensions=<abbrev>}` - expected
        to match
      * `\\p{sc=<script name>}` or `\\p{Script=<script name>}` - expected
        not to match (skipped for a hand-maintained list of scripts)

    The short and long property names alternate from one script to the next.

    Args:
        tables_path: path of the ScriptExtensions.txt data file. The default
            is the location used by the original generator, relative to the
            current directory.

    Raises:
        IOError/OSError: if the data file cannot be opened.
        Whatever `script_abbrevs.index` raises when an abbreviation found in
        the data file is unknown to GenerateCommon.
    """
    extended_scripts = []           # [abbrev, lowest, highest] records
    extended_script_indices = {}    # abbrev -> position in extended_scripts

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

    with open(tables_path) as f:
        for line in f:
            match_obj = property_re.match(line)

            # Comments, blank lines and anything else that is not a data
            # line are ignored.
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for name in match_obj.group(3).split(" "):
                if name not in extended_script_indices:
                    extended_script_indices[name] = len(extended_scripts)
                    extended_scripts.append([name, low, high])
                    continue

                # Widen the recorded span so that it covers this range too.
                rec = extended_scripts[extended_script_indices[name]]
                rec[1] = min(rec[1], low)
                rec[2] = max(rec[2], high)

    long_property_name = False

    for abbrev, low, high in extended_scripts:
        script_name = script_names[script_abbrevs.index(abbrev)]

        _write_pattern_test("/^\\p{%s}/utf\n" % script_name, low, high, True)

        property_name = "Script_Extensions" if long_property_name else "scx"
        _write_pattern_test("/^\\p{%s=%s}/utf\n" % (property_name, abbrev),
                            low, high, True)

        property_name = "Script" if long_property_name else "sc"

        # Some negative tests are not working because script extensions
        # overlap with scripts. These scripts are excluded. This list needs
        # to be maintained by hand at the moment.
        if script_name not in ("Devanagari", "Manichaean", "Cyrillic", "Tamil", "Myanmar"):
            _write_pattern_test("/^\\p{%s=%s}/utf\n" % (property_name, script_name),
                                low, high, False)

        # Alternate between the short and the long property names so that
        # both spellings get exercised.
        long_property_name = not long_property_name
|
||
# Emit the script extension tests into both generated files.
gen_script_extension_tests()

write_both("# End of testinput26\n")

# Close both files explicitly so that everything written above is flushed to
# disk here, rather than relying on interpreter shutdown to do it.
input_file.close()
output_file.close()
Oops, something went wrong.