Skip to content

Commit

Permalink
Auto generate unicode property tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
Zoltan Herczeg committed Dec 30, 2021
1 parent 6614b28 commit 14d338e
Show file tree
Hide file tree
Showing 4 changed files with 2,064 additions and 2 deletions.
23 changes: 21 additions & 2 deletions RunTest
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
title23="Test 23: \C disabled test"
title24="Test 24: Non-UTF pattern conversion tests"
title25="Test 25: UTF pattern conversion tests"
maxtest=25
title26="Test 26: Auto-generated unicode property tests"
maxtest=26

if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title0
Expand Down Expand Up @@ -109,6 +110,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title23
echo $title24
echo $title25
echo $title26
exit 0
fi

Expand Down Expand Up @@ -238,6 +240,7 @@ do22=no
do23=no
do24=no
do25=no
do26=no

while [ $# -gt 0 ] ; do
case $1 in
Expand Down Expand Up @@ -267,6 +270,7 @@ while [ $# -gt 0 ] ; do
23) do23=yes;;
24) do24=yes;;
25) do25=yes;;
26) do26=yes;;
-8) arg8=yes;;
-16) arg16=yes;;
-32) arg32=yes;;
Expand Down Expand Up @@ -417,7 +421,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
$do24 = no -a $do25 = no \
$do24 = no -a $do25 = no -a $do26 = no \
]; then
do0=yes
do1=yes
Expand Down Expand Up @@ -445,6 +449,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
do23=yes
do24=yes
do25=yes
do26=yes
fi

# Handle any explicit skips at this stage, so that an argument list may consist
Expand Down Expand Up @@ -863,6 +868,20 @@ for bmode in "$test8" "$test16" "$test32"; do
fi
fi

# Auto-generated unicode property tests

# Test 26 runs only when $do26 is "yes". It prints its title first and is
# skipped with a message when $utf is 0.
if [ $do26 = yes ] ; then
echo $title26
if [ $utf -eq 0 ] ; then
echo " Skipped because UTF-$bits support is not available"
else
# Run once with no extra option, then once for each word in $jitopt.
# ${opt:+$vjs} expands to $vjs only when $opt is non-empty.
# The exit status of pcre2test, the test number and the option are then
# handed to checkresult.
for opt in "" $jitopt; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
checkresult $? 26 "$opt"
done
fi
fi

# End of loop for 8/16/32-bit tests
done

Expand Down
124 changes: 124 additions & 0 deletions maint/GenerateTest26.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator after the Unicode files are
# updated. The names of the generated files are `testinput26` and
# `testoutput26`; an optional single argument names the directory in which
# they are written.

import re
import sys

from GenerateCommon import \
script_names, \
script_abbrevs

def write_both(text):
    """Append *text* to the test input file and to the expected-output file.

    Pattern lines, subject lines and comments must appear identically in
    both generated files, so they are written through this helper.
    """
    for stream in (input_file, output_file):
        stream.write(text)

# Prefix prepended to the generated file names. Empty means the files are
# created in the current working directory.
output_directory = ""

# At most one command-line argument is accepted: the output directory.
if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)
if len(sys.argv) == 2:
    output_directory = sys.argv[1]
    # Only add the separator to a non-empty name: turning an empty argument
    # into "/" would silently redirect the output to the filesystem root.
    if output_directory and not output_directory.endswith("/"):
        output_directory += "/"

# Both files stay open for the rest of the script; write_both() and the
# generator functions write to them directly.
try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print ("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
# UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")

def gen_script_extension_tests():
    """Generate the Script_Extensions tests from ScriptExtensions.txt.

    Reads "Unicode.tables/ScriptExtensions.txt" (relative to the current
    directory) and, for every script abbreviation found there, records the
    lowest and highest code point listed for it. For each abbreviation it
    then writes up to three tests through write_both() / output_file:

      * \\p{<script name>}            expected to match both code points
      * \\p{scx=<abbreviation>}       expected to match both code points
      * \\p{sc=<script name>}         expected NOT to match (skipped for the
                                      hand-maintained exclusion list below)

    The short ("scx"/"sc") and long ("Script_Extensions"/"Script") property
    names alternate from one abbreviation to the next.

    Raises ValueError if an abbreviation is missing from script_abbrevs.
    """
    # Each record is [abbreviation, lowest code point, highest code point].
    # The list preserves the order in which abbreviations first appear; the
    # dictionary gives direct access to the same record objects.
    extended_scripts = []
    records_by_name = {}

    property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            # Comments, blank lines and anything else that is not a data line.
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            # split() without an argument ignores repeated or trailing
            # spaces, which would otherwise produce an empty script name.
            for name in match_obj.group(3).split():
                rec = records_by_name.get(name)
                if rec is None:
                    rec = [name, low, high]
                    records_by_name[name] = rec
                    extended_scripts.append(rec)
                    continue

                rec[1] = min(rec[1], low)
                rec[2] = max(rec[2], high)

    def emit_test(pattern, rec, matches):
        # Write one pattern followed by its one or two subject lines. The
        # expected result after each subject goes to the output file only.
        write_both(pattern)

        code_points = [rec[1]]
        if rec[1] != rec[2]:
            code_points.append(rec[2])

        for code_point in code_points:
            write_both(" \\x{%x}\n" % code_point)
            if matches:
                output_file.write(" 0: \\x{%x}\n" % code_point)
            else:
                output_file.write("No match\n")
        write_both("\n")

    long_property_name = False

    for rec in extended_scripts:
        script_name = script_names[script_abbrevs.index(rec[0])]

        emit_test("/^\\p{%s}/utf\n" % script_name, rec, True)

        property_name = "Script_Extensions" if long_property_name else "scx"
        emit_test("/^\\p{%s=%s}/utf\n" % (property_name, rec[0]), rec, True)

        property_name = "Script" if long_property_name else "sc"

        # Some negative tests are not working because script extensions
        # overlap with scripts. These scripts are excluded. This list needs
        # to be maintained by hand at the moment.
        if script_name not in ("Devanagari", "Manichaean", "Cyrillic", "Tamil", "Myanmar"):
            emit_test("/^\\p{%s=%s}/utf\n" % (property_name, script_name), rec, False)

        # Alternate between the short and the long property names.
        long_property_name = not long_property_name

gen_script_extension_tests()

write_both("# End of testinput26\n")

# Close both generated files explicitly so that all buffered data is flushed
# and any write error is reported before the script exits.
input_file.close()
output_file.close()
Loading

0 comments on commit 14d338e

Please sign in to comment.