Skip to content

Commit

Permalink
Implement script extension support in JIT. (#66)
Browse files Browse the repository at this point in the history
Fix incorrect operator in GenerateUcd.py (modulo -> bitwise AND)

Co-authored-by: Zoltan Herczeg <[email protected]>
  • Loading branch information
zherczeg and Zoltan Herczeg authored Dec 29, 2021
1 parent afa4756 commit 6614b28
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 63 deletions.
2 changes: 1 addition & 1 deletion maint/GenerateUcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ def write_records(records, record_size):
bitwords = [0] * script_list_item_size

for idx in d:
bitwords[idx // 32] |= 1 << (idx % 31)
bitwords[idx // 32] |= 1 << (idx & 31)

s = " "
for x in bitwords:
Expand Down
54 changes: 50 additions & 4 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7417,9 +7417,10 @@ static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHA
#define XCLASS_CHAR_SAVED 0x02
#define XCLASS_HAS_TYPE 0x04
#define XCLASS_HAS_SCRIPT 0x08
#define XCLASS_HAS_BIDICO 0x10
#define XCLASS_HAS_BIDICL 0x20
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10
#define XCLASS_HAS_BIDICO 0x20
#define XCLASS_HAS_BIDICL 0x40
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#endif /* SUPPORT_UNICODE */

static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
Expand Down Expand Up @@ -7518,6 +7519,10 @@ while (*cc != XCL_END)
unicode_status |= XCLASS_HAS_TYPE;
break;

case PT_SCX:
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
compares++;

case PT_SC:
unicode_status |= XCLASS_HAS_SCRIPT;
break;
Expand Down Expand Up @@ -7674,7 +7679,7 @@ if (unicode_status & XCLASS_NEEDS_UCD)
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SC)
if (*cc == PT_SC || *cc == PT_SCX)
{
compares--;
invertcmp = (compares == 0 && list != backtracks);
Expand All @@ -7690,6 +7695,46 @@ if (unicode_status & XCLASS_NEEDS_UCD)
cc = ccbegin;
}

if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
{
while (*cc != XCL_END)
{
if (*cc == XCL_SINGLE)
{
cc ++;
GETCHARINCTEST(c, cc);
}
else if (*cc == XCL_RANGE)
{
cc ++;
GETCHARINCTEST(c, cc);
GETCHARINCTEST(c, cc);
}
else
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SCX)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));

compares--;
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
add_jump(compiler, compares > 0 ? list : backtracks, jump);
}
cc += 2;
}
}

cc = ccbegin;
}

if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL))
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bidi));
Expand Down Expand Up @@ -7879,6 +7924,7 @@ while (*cc != XCL_END)
break;

case PT_SC:
case PT_SCX:
case PT_BIDICO:
case PT_BIDICL:
compares++;
Expand Down
84 changes: 42 additions & 42 deletions src/pcre2_ucd.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,66 +172,66 @@ const uint32_t PRIV(ucd_script_sets)[] = {
0x00000000u, 0x00000000u, 0x00000000u,
0x00000002u, 0x00000000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00008000u, 0x00000000u,
0x00000000u, 0x00004000u, 0x00000000u,
0x00000800u, 0x00000000u, 0x00000000u,
0x00004000u, 0x00000000u, 0x00000000u,
0x00100000u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00000000u, 0x00000004u,
0x00000000u, 0x00000000u, 0x00000001u,
0x20000000u, 0x00000000u, 0x00000000u,
0x00000021u, 0x00000000u, 0x00000000u,
0x00000001u, 0x00000001u, 0x00000000u,
0x00000001u, 0x00000040u, 0x00000000u,
0x00000001u, 0x40000000u, 0x00000000u,
0x00000001u, 0x00000020u, 0x00000000u,
0x20000001u, 0x00000000u, 0x00000000u,
0x00000001u, 0x00000010u, 0x00000000u,
0x00000001u, 0x00000008u, 0x00000000u,
0x00000102u, 0x00000000u, 0x00000000u,
0x00004004u, 0x00000000u, 0x00000000u,
0x00000008u, 0x00000200u, 0x00000000u,
0x00000008u, 0x00000100u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000000u,
0x00000480u, 0x00000000u, 0x00000000u,
0x00100080u, 0x00000000u, 0x00000000u,
0x00000080u, 0x00800000u, 0x00000000u,
0x00000080u, 0x00400000u, 0x00000000u,
0x20000080u, 0x00000000u, 0x00000000u,
0x00000100u, 0x00010000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000004u,
0x00000100u, 0x00002000u, 0x00000000u,
0x00000100u, 0x00000004u, 0x00000000u,
0x00000100u, 0x00008000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000001u,
0x00000100u, 0x00001000u, 0x00000000u,
0x00000100u, 0x00000002u, 0x00000000u,
0x00100200u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00010004u, 0x00000000u,
0x00001000u, 0x00020000u, 0x00000000u,
0x00002000u, 0x04000000u, 0x00000000u,
0x00000000u, 0x00008002u, 0x00000000u,
0x00001000u, 0x00010000u, 0x00000000u,
0x00002000u, 0x02000000u, 0x00000000u,
0x00104000u, 0x00000000u, 0x00000000u,
0x000a0000u, 0x00000000u, 0x00000000u,
0x00040000u, 0x00000000u, 0x00000004u,
0x00040000u, 0x00000000u, 0x00000001u,
0x01100000u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00200000u, 0x00000020u,
0x01000000u, 0x00000080u, 0x00000000u,
0x20000001u, 0x00000010u, 0x00000000u,
0x00000001u, 0x00000010u, 0x00000008u,
0x10000002u, 0x00001000u, 0x00000000u,
0x02000000u, 0x00001002u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000010u,
0x00400040u, 0x00080000u, 0x00000000u,
0x00040100u, 0x00010000u, 0x00000000u,
0x00100100u, 0x00010000u, 0x00000000u,
0x00000000u, 0x00100000u, 0x00000008u,
0x01000000u, 0x00000040u, 0x00000000u,
0x20000001u, 0x00000008u, 0x00000000u,
0x00000001u, 0x00000008u, 0x00000002u,
0x10000002u, 0x00000800u, 0x00000000u,
0x02000000u, 0x00000801u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000004u,
0x00400040u, 0x00040000u, 0x00000000u,
0x00040100u, 0x00008000u, 0x00000000u,
0x00100100u, 0x00008000u, 0x00000000u,
0x000a4000u, 0x00000000u, 0x00000000u,
0x02100000u, 0x00000100u, 0x00000000u,
0x00040102u, 0x00010000u, 0x00000000u,
0x40010011u, 0x00000000u, 0x00000000u,
0x00000100u, 0x20100400u, 0x00000000u,
0x02100000u, 0x00000080u, 0x00000000u,
0x00040102u, 0x00008000u, 0x00000000u,
0xc0010010u, 0x00000000u, 0x00000000u,
0x00000100u, 0x10080200u, 0x00000000u,
0x000ac004u, 0x00000000u, 0x00000000u,
0x20000001u, 0x00000051u, 0x00000008u,
0x000ac004u, 0x00000020u, 0x00000000u,
0x04840100u, 0x0000000cu, 0x00000000u,
0x20000001u, 0x08000051u, 0x00000008u,
0x04040102u, 0x02010008u, 0x00000004u,
0x20000001u, 0x09200803u, 0x00000020u,
0x00003100u, 0x22564400u, 0x00000000u,
0x04943102u, 0x0201000cu, 0x00000000u,
0x04943102u, 0x0201200cu, 0x00000000u,
0x00043100u, 0x22564400u, 0x00000004u,
0x00843100u, 0x22564400u, 0x00000004u,
0x1c843102u, 0x7215400cu, 0x00000004u,
0x1ca43102u, 0x7215400cu, 0x00000004u,
0x20000001u, 0x40000028u, 0x00000002u,
0x000ac004u, 0x00000010u, 0x00000000u,
0x04840100u, 0x00000006u, 0x00000000u,
0x20000001u, 0x44000028u, 0x00000002u,
0x04040102u, 0x01008004u, 0x00000001u,
0x20000001u, 0xc4900400u, 0x00000008u,
0x00003100u, 0x112b2200u, 0x00000000u,
0x04943102u, 0x01008006u, 0x00000000u,
0x04943102u, 0x01009006u, 0x00000000u,
0x00043100u, 0x112b2200u, 0x00000001u,
0x00843100u, 0x112b2200u, 0x00000001u,
0x1c843102u, 0x390aa006u, 0x00000001u,
0x1ca43102u, 0x390aa006u, 0x00000001u,
};

/* These are the main two-stage UCD tables. The fields in each record are:
Expand Down
4 changes: 0 additions & 4 deletions testdata/testinput4
Original file line number Diff line number Diff line change
Expand Up @@ -1133,8 +1133,6 @@
A\x{300}\x{301}\x{302}BC
\x{300}

#subject no_jit

/^\p{Han}+/utf
\x{2e81}\x{3007}\x{2f804}\x{31a0}
\= Expect no match
Expand All @@ -1157,8 +1155,6 @@
\x{a014}
\x{a4c6}

#subject -no_jit

/^\p{Any}X/utf
AXYZ
\x{1234}XYZ
Expand Down
4 changes: 0 additions & 4 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -1337,8 +1337,6 @@

# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE

#subject no_jit

/^[\p{Batak}]/utf
\x{1bc0}
\x{1bff}
Expand All @@ -1358,8 +1356,6 @@
\x{85c}
\x{85d}

#subject -no_jit

/(\X*)(.)/s,utf
A\x{300}

Expand Down
4 changes: 0 additions & 4 deletions testdata/testoutput4
Original file line number Diff line number Diff line change
Expand Up @@ -1876,8 +1876,6 @@ No match
\x{300}
0: \x{300}

#subject no_jit

/^\p{Han}+/utf
\x{2e81}\x{3007}\x{2f804}\x{31a0}
0: \x{2e81}\x{3007}\x{2f804}
Expand Down Expand Up @@ -1910,8 +1908,6 @@ No match
\x{a4c6}
No match

#subject -no_jit

/^\p{Any}X/utf
AXYZ
0: AX
Expand Down
4 changes: 0 additions & 4 deletions testdata/testoutput5
Original file line number Diff line number Diff line change
Expand Up @@ -2842,8 +2842,6 @@ No match

# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE

#subject no_jit

/^[\p{Batak}]/utf
\x{1bc0}
0: \x{1bc0}
Expand Down Expand Up @@ -2873,8 +2871,6 @@ No match
\x{85d}
No match

#subject -no_jit

/(\X*)(.)/s,utf
A\x{300}
0: A
Expand Down

0 comments on commit 6614b28

Please sign in to comment.