Auto generate unicode property tests.

PCRE2Project · Dec 30, 2021 · 14d338e · 14d338e
1 parent 6614b28
commit 14d338e
Show file tree

Hide file tree

Showing 4 changed files with 2,064 additions and 2 deletions.
diff --git a/RunTest b/RunTest
@@ -80,7 +80,8 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
 title23="Test 23: \C disabled test"
 title24="Test 24: Non-UTF pattern conversion tests"
 title25="Test 25: UTF pattern conversion tests"
-maxtest=25
+title26="Test 26: Auto-generated unicode property tests"
+maxtest=26
 
 if [ $# -eq 1 -a "$1" = "list" ]; then
   echo $title0
@@ -109,6 +110,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
   echo $title23
   echo $title24
   echo $title25
+  echo $title26
   exit 0
 fi
 
@@ -238,6 +240,7 @@ do22=no
 do23=no
 do24=no
 do25=no
+do26=no
 
 while [ $# -gt 0 ] ; do
   case $1 in
@@ -267,6 +270,7 @@ while [ $# -gt 0 ] ; do
    23) do23=yes;;
    24) do24=yes;;
    25) do25=yes;;
+   26) do26=yes;;
    -8) arg8=yes;;
   -16) arg16=yes;;
   -32) arg32=yes;;
@@ -417,7 +421,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
      $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
      $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
      $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
-     $do24 = no -a $do25 = no \
+     $do24 = no -a $do25 = no -a $do26 = no \
    ]; then
   do0=yes
   do1=yes
@@ -445,6 +449,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
   do23=yes
   do24=yes
   do25=yes
+  do26=yes
 fi
 
 # Handle any explicit skips at this stage, so that an argument list may consist
@@ -863,6 +868,20 @@ for bmode in "$test8" "$test16" "$test32"; do
     fi
   fi
 
+  # Auto-generated Unicode property tests (input: $testdata/testinput26)
+
+  if [ $do26 = yes ] ; then
+    echo $title26
+    if [ $utf -eq 0 ] ; then  # these patterns use /utf, so skip when $utf is 0
+      echo "  Skipped because UTF-$bits support is not available"
+    else
+      for opt in "" $jitopt; do  # one run with no extra option, then one per word of $jitopt
+        $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry  # $vjs is only passed when $opt is non-empty
+        checkresult $? 26 "$opt"
+      done
+    fi
+  fi
+
 # End of loop for 8/16/32-bit tests
 done
 

diff --git a/maint/GenerateTest26.py b/maint/GenerateTest26.py
@@ -0,0 +1,124 @@
+#! /usr/bin/python
+
+#                   PCRE2 UNICODE PROPERTY SUPPORT
+#                   ------------------------------
+#
+# This file auto-generates Unicode property tests and their expected output.
+# Re-run this generator whenever the Unicode data files are updated. The
+# generated files are named `testinput26` and `testoutput26`.
+
+import re
+import sys
+
+from GenerateCommon import \
+  script_names, \
+  script_abbrevs
+
+def write_both(text):  # Write `text` unchanged to both generated files (test input and expected output).
+  input_file.write(text)
+  output_file.write(text)
+
+output_directory = ""  # empty prefix: files are created relative to the current directory
+
+if len(sys.argv) > 2:  # at most one argument (the output directory) is accepted
+  print('** Too many arguments: just give a directory name')
+  sys.exit(1)
+if len(sys.argv) == 2:
+  output_directory = sys.argv[1]
+  if not output_directory.endswith("/"):  # file names are appended by plain concatenation below
+    output_directory += "/"
+
+try:  # NOTE(review): neither file is explicitly closed in the visible code - confirm this is intended
+  input_file = open(output_directory + "testinput26", "w")
+  output_file = open(output_directory + "testoutput26", "w")
+except IOError:
+  print ("** Couldn't open output files")
+  sys.exit(1)
+
+write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")  # banner in both files
+
+# ---------------------------------------------------------------------------
+#                      UNICODE SCRIPT EXTENSION TESTS
+# ---------------------------------------------------------------------------
+
+write_both("# Unicode Script Extension tests.\n\n")  # section heading in both files
+
+def gen_script_extension_tests():  # Emit up to three tests per name found in ScriptExtensions.txt; takes and returns nothing.
+  extended_scripts = []  # one mutable [name, lowest code point, highest code point] record per name, in first-seen order
+  extended_script_indicies = {}  # name -> index of its record in extended_scripts
+
+  with open("Unicode.tables/ScriptExtensions.txt") as f:  # path is relative to the current working directory
+    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")  # data line: CODE[..CODE] ; names #
+
+    for line in f:
+      match_obj = property_re.match(line)
+
+      if match_obj == None:  # not a data line: skip it
+        continue
+
+      low = int(match_obj.group(1), 16)
+      high = low  # a line without "..CODE" describes a single code point
+      if match_obj.group(2) != None:
+        high = int(match_obj.group(2), 16)
+
+      for name in match_obj.group(3).split(" "):  # one line may list several space-separated names
+        if name not in extended_script_indicies:  # first occurrence: start a new record
+          extended_script_indicies[name] = len(extended_scripts)
+          extended_scripts.append([name, low, high])
+          continue
+
+        rec = extended_scripts[extended_script_indicies[name]]  # seen before: widen the recorded span if needed
+        if rec[1] > low:
+          rec[1] = low
+        if rec[2] < high:
+          rec[2] = high
+
+  long_property_name = False  # flipped after every record so short and long property names alternate
+
+  for rec in extended_scripts:
+    script_name = script_names[script_abbrevs.index(rec[0])]  # script_names entry at the position of rec[0] in script_abbrevs
+
+    write_both("/^\\p{%s}/utf\n" % script_name)  # test 1: bare \p{<script_name>}, expected to match
+    write_both("  \\x{%x}\n" % rec[1])  # subject line: lowest recorded code point
+    output_file.write(" 0: \\x{%x}\n" % rec[1])  # expected-match line goes to the output file only
+    if rec[1] != rec[2]:  # add the highest code point only when it differs from the lowest
+      write_both("  \\x{%x}\n" % rec[2])
+      output_file.write(" 0: \\x{%x}\n" % rec[2])
+    write_both("\n")
+
+    property_name = "scx"
+    if long_property_name:
+      property_name = "Script_Extensions"
+
+    write_both("/^\\p{%s=%s}/utf\n" % (property_name, rec[0]))  # test 2: explicit script-extensions property, expected to match
+    write_both("  \\x{%x}\n" % rec[1])
+    output_file.write(" 0: \\x{%x}\n" % rec[1])
+    if rec[1] != rec[2]:
+      write_both("  \\x{%x}\n" % rec[2])
+      output_file.write(" 0: \\x{%x}\n" % rec[2])
+    write_both("\n")
+
+    property_name = "sc"
+    if long_property_name:
+      property_name = "Script"
+
+    # Some negative tests do not work because script extensions overlap with
+    # scripts. The affected scripts are excluded; this list currently has to
+    # be maintained by hand.
+
+    if script_name not in ("Devanagari", "Manichaean", "Cyrillic", "Tamil", "Myanmar"):
+      write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_name))  # test 3: plain script property, expected NOT to match
+
+      write_both("  \\x{%x}\n" % rec[1])
+      output_file.write("No match\n")
+
+      if rec[1] != rec[2]:
+        write_both("  \\x{%x}\n" % rec[2])
+        output_file.write("No match\n")
+      write_both("\n")
+
+    long_property_name = not long_property_name
+
+gen_script_extension_tests()  # write all Script Extension tests to both files
+
+write_both("# End of testinput26\n")  # trailer; the same text is written to testoutput26 as well