diff --git a/README.md b/README.md index a142a8b..279e5e4 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This library provides circom circuits that enables you to prove that - the input string satisfies regular expressions (regexes) specified in the chip. - the substrings are correctly extracted from the input string according to substring definitions. -This is a JS/Rust adaptation of the Python regex-to-circom work done by [sampriti](https://github.com/sampritipanda/) and [yush_g](https://twitter.com/yush_g) at https://www.zkregex.com +This is a JS/Rust adaptation of the Python regex-to-circom work done by [sampriti](https://github.com/sampritipanda/) and [yush_g](https://twitter.com/yush_g), along with [sorasue](https://github.com/SoraSuegami/)'s decomposed specifications. You can generate your own regex circuits with our no-code tool at https://www.zkregex.com. In addition to the original work, this library also supports the following features: - CLI to dynamically generate regex circuit based on regex argument @@ -22,6 +22,7 @@ You can define a regex to be proved and its substring patterns to be revealed. Specifically, there are two ways to define them: 1. (manual way) converting the regex into an equivalent determistic finite automaton (DFA), selecting state transitions for each substring pattern, and writing the transitions in a json file. 2. (automatic way) writing a decomposed version of the regex in a json file with specifying which part of the regex is revealed. +3. (no-code way) entering the regex into the tool on [zkregex.com](https://www.zkregex.com), highlighting the part to be revealed, and copying the generated circuit. While the manual way supports more kinds of regexes than the automatic way, the latter is easier and sufficient for most regexes. 
### Theory @@ -55,7 +56,7 @@ For example, if you want to verify the regex of `email was meant for @(a|b|c|d|e }, { "is_public": true, - "regex_def": "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)+" + "regex_def": "[a-z]+" }, { "is_public": false, @@ -72,7 +73,7 @@ You can generate its regex circom as follows. #### `zk-regex raw -r -s -c -t -g ` This command generates a regex circom from a raw string of the regex definition and a json file that defines state transitions in DFA to be revealed. For example, to verify the regex `1=(a|b) (2=(b|c)+ )+d` and reveal its alphabets, -1. Visualize DFA of the regex using [this website](https://mindfa.onrender.com/min_dfa). +1. Visualize DFA of the regex using [this website](https://zkregex.com). 2. Find state transitions matching with the substrings to be revealed. In this case, they are `2->3` for the alphabets after `1=`, `6->7` and `7->7` for those after `2=`, and `8->9` for `d`. 3. Make a json file at `./simple_regex_substrs.json` that defines the state transitions. 
For example, ``` diff --git a/package.json b/package.json index 74de367..3866c78 100644 --- a/package.json +++ b/package.json @@ -8,10 +8,10 @@ "packages/*" ], "contributors": [ - "Javier Su ", - "Kata Choi ", "Sora Suegami ", - "Yush G " + "Yush G ", + "Javier Su ", + "Kata Choi " ], "scripts": { "install": "yarn workspaces -pt run install", diff --git a/packages/circom/circuits/common/email_addr_regex.circom b/packages/circom/circuits/common/email_addr_regex.circom index 9a029d5..633015a 100644 --- a/packages/circom/circuits/common/email_addr_regex.circom +++ b/packages/circom/circuits/common/email_addr_regex.circom @@ -2,6 +2,7 @@ pragma circom 2.1.5; include "@zk-email/zk-regex-circom/circuits/regex_helpers.circom"; +// regex: (a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|!|#|$|%|&|'|\*|\+|-|/|=|\?|^|_|`|{|\||}|~|\.)+@(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|\.|-)+ template EmailAddrRegex(msg_bytes) { signal input msg[msg_bytes]; signal output out; @@ -248,6 +249,7 @@ template EmailAddrRegex(msg_bytes) { is_consecutive[msg_bytes-1-i][0] <== states[num_bytes-i][3] * (1 - is_consecutive[msg_bytes-i][1]) + is_consecutive[msg_bytes-i][1]; is_consecutive[msg_bytes-1-i][1] <== state_changed[msg_bytes-i].out * is_consecutive[msg_bytes-1-i][0]; } + // substrings calculated: [{(1, 2), (1, 1), (0, 1), (3, 3), (2, 3)}] signal is_substr0[msg_bytes][6]; signal is_reveal0[msg_bytes]; signal output reveal0[msg_bytes]; diff --git a/packages/circom/circuits/common/email_domain_regex.circom b/packages/circom/circuits/common/email_domain_regex.circom index 20f3cc5..38f05c6 100644 --- a/packages/circom/circuits/common/email_domain_regex.circom +++ b/packages/circom/circuits/common/email_domain_regex.circom @@ -2,6 +2,7 @@ pragma circom 2.1.5; include "@zk-email/zk-regex-circom/circuits/regex_helpers.circom"; +// regex: 
(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|!|#|$|%|&|'|\*|\+|-|/|=|\?|^|_|`|{|\||}|~|\.)+@(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|\.|-)+ template EmailDomainRegex(msg_bytes) { signal input msg[msg_bytes]; signal output out; @@ -248,6 +249,7 @@ template EmailDomainRegex(msg_bytes) { is_consecutive[msg_bytes-1-i][0] <== states[num_bytes-i][3] * (1 - is_consecutive[msg_bytes-i][1]) + is_consecutive[msg_bytes-i][1]; is_consecutive[msg_bytes-1-i][1] <== state_changed[msg_bytes-i].out * is_consecutive[msg_bytes-1-i][0]; } + // substrings calculated: [{(3, 3), (2, 3)}] signal is_substr0[msg_bytes][3]; signal is_reveal0[msg_bytes]; signal output reveal0[msg_bytes]; diff --git a/packages/circom/circuits/common/message_id_regex.circom b/packages/circom/circuits/common/message_id_regex.circom index f89299e..e521d52 100644 --- a/packages/circom/circuits/common/message_id_regex.circom +++ b/packages/circom/circuits/common/message_id_regex.circom @@ -2,6 +2,7 @@ pragma circom 2.1.5; include "@zk-email/zk-regex-circom/circuits/regex_helpers.circom"; +// regex: (( \n)|^)message-id:<(=|@|\.|\+|_|-|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9)+> \n template MessageIdRegex(msg_bytes) { signal input msg[msg_bytes]; signal output out; @@ -270,6 +271,7 @@ template MessageIdRegex(msg_bytes) { is_consecutive[msg_bytes-1-i][0] <== states[num_bytes-i][7] * (1 - is_consecutive[msg_bytes-i][1]) + is_consecutive[msg_bytes-i][1]; is_consecutive[msg_bytes-1-i][1] <== state_changed[msg_bytes-i].out * is_consecutive[msg_bytes-1-i][0]; } + // substrings calculated: [{(17, 18), (1, 4), (1, 1), (18, 1)}] signal is_substr0[msg_bytes][5]; signal is_reveal0[msg_bytes]; signal output reveal0[msg_bytes]; diff --git 
a/packages/circom/tests/circuits/simple_regex_decomposed.circom b/packages/circom/tests/circuits/simple_regex_decomposed.circom new file mode 100644 index 0000000..f132bd9 --- /dev/null +++ b/packages/circom/tests/circuits/simple_regex_decomposed.circom @@ -0,0 +1,288 @@ +pragma circom 2.1.5; + +include "@zk-email/zk-regex-circom/circuits/regex_helpers.circom"; + +// regex: email was meant for @(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_)+ +template SimpleRegexDecomposed(msg_bytes) { + signal input msg[msg_bytes]; + signal output out; + + var num_bytes = msg_bytes+1; + signal in[num_bytes]; + in[0]<==255; + for (var i = 0; i < msg_bytes; i++) { + in[i+1] <== msg[i]; + } + + component eq[26][num_bytes]; + component lt[4][num_bytes]; + component and[26][num_bytes]; + component multi_or[2][num_bytes]; + signal states[num_bytes+1][24]; + component state_changed[num_bytes]; + + states[0][0] <== 1; + for (var i = 1; i < 24; i++) { + states[0][i] <== 0; + } + + for (var i = 0; i < num_bytes; i++) { + state_changed[i] = MultiOR(23); + lt[0][i] = LessEqThan(8); + lt[0][i].in[0] <== 65; + lt[0][i].in[1] <== in[i]; + lt[1][i] = LessEqThan(8); + lt[1][i].in[0] <== in[i]; + lt[1][i].in[1] <== 90; + and[0][i] = AND(); + and[0][i].a <== lt[0][i].out; + and[0][i].b <== lt[1][i].out; + lt[2][i] = LessEqThan(8); + lt[2][i].in[0] <== 97; + lt[2][i].in[1] <== in[i]; + lt[3][i] = LessEqThan(8); + lt[3][i].in[0] <== in[i]; + lt[3][i].in[1] <== 122; + and[1][i] = AND(); + and[1][i].a <== lt[2][i].out; + and[1][i].b <== lt[3][i].out; + eq[0][i] = IsEqual(); + eq[0][i].in[0] <== in[i]; + eq[0][i].in[1] <== 48; + eq[1][i] = IsEqual(); + eq[1][i].in[0] <== in[i]; + eq[1][i].in[1] <== 49; + eq[2][i] = IsEqual(); + eq[2][i].in[0] <== in[i]; + eq[2][i].in[1] <== 50; + eq[3][i] = IsEqual(); + eq[3][i].in[0] <== in[i]; + eq[3][i].in[1] <== 51; + eq[4][i] = IsEqual(); + eq[4][i].in[0] <== in[i]; + eq[4][i].in[1] <== 
52; + eq[5][i] = IsEqual(); + eq[5][i].in[0] <== in[i]; + eq[5][i].in[1] <== 53; + eq[6][i] = IsEqual(); + eq[6][i].in[0] <== in[i]; + eq[6][i].in[1] <== 54; + eq[7][i] = IsEqual(); + eq[7][i].in[0] <== in[i]; + eq[7][i].in[1] <== 55; + eq[8][i] = IsEqual(); + eq[8][i].in[0] <== in[i]; + eq[8][i].in[1] <== 56; + eq[9][i] = IsEqual(); + eq[9][i].in[0] <== in[i]; + eq[9][i].in[1] <== 57; + eq[10][i] = IsEqual(); + eq[10][i].in[0] <== in[i]; + eq[10][i].in[1] <== 95; + and[2][i] = AND(); + and[2][i].a <== states[i][1]; + multi_or[0][i] = MultiOR(13); + multi_or[0][i].in[0] <== and[0][i].out; + multi_or[0][i].in[1] <== and[1][i].out; + multi_or[0][i].in[2] <== eq[0][i].out; + multi_or[0][i].in[3] <== eq[1][i].out; + multi_or[0][i].in[4] <== eq[2][i].out; + multi_or[0][i].in[5] <== eq[3][i].out; + multi_or[0][i].in[6] <== eq[4][i].out; + multi_or[0][i].in[7] <== eq[5][i].out; + multi_or[0][i].in[8] <== eq[6][i].out; + multi_or[0][i].in[9] <== eq[7][i].out; + multi_or[0][i].in[10] <== eq[8][i].out; + multi_or[0][i].in[11] <== eq[9][i].out; + multi_or[0][i].in[12] <== eq[10][i].out; + and[2][i].b <== multi_or[0][i].out; + and[3][i] = AND(); + and[3][i].a <== states[i][23]; + and[3][i].b <== multi_or[0][i].out; + multi_or[1][i] = MultiOR(2); + multi_or[1][i].in[0] <== and[2][i].out; + multi_or[1][i].in[1] <== and[3][i].out; + states[i+1][1] <== multi_or[1][i].out; + state_changed[i].in[0] <== states[i+1][1]; + eq[11][i] = IsEqual(); + eq[11][i].in[0] <== in[i]; + eq[11][i].in[1] <== 101; + and[4][i] = AND(); + and[4][i].a <== states[i][0]; + and[4][i].b <== eq[11][i].out; + states[i+1][2] <== and[4][i].out; + state_changed[i].in[1] <== states[i+1][2]; + eq[12][i] = IsEqual(); + eq[12][i].in[0] <== in[i]; + eq[12][i].in[1] <== 109; + and[5][i] = AND(); + and[5][i].a <== states[i][2]; + and[5][i].b <== eq[12][i].out; + states[i+1][3] <== and[5][i].out; + state_changed[i].in[2] <== states[i+1][3]; + eq[13][i] = IsEqual(); + eq[13][i].in[0] <== in[i]; + eq[13][i].in[1] <== 46; 
+ and[6][i] = AND(); + and[6][i].a <== states[i][1]; + and[6][i].b <== eq[13][i].out; + states[i+1][4] <== and[6][i].out; + state_changed[i].in[3] <== states[i+1][4]; + eq[14][i] = IsEqual(); + eq[14][i].in[0] <== in[i]; + eq[14][i].in[1] <== 97; + and[7][i] = AND(); + and[7][i].a <== states[i][3]; + and[7][i].b <== eq[14][i].out; + states[i+1][5] <== and[7][i].out; + state_changed[i].in[4] <== states[i+1][5]; + eq[15][i] = IsEqual(); + eq[15][i].in[0] <== in[i]; + eq[15][i].in[1] <== 105; + and[8][i] = AND(); + and[8][i].a <== states[i][5]; + and[8][i].b <== eq[15][i].out; + states[i+1][6] <== and[8][i].out; + state_changed[i].in[5] <== states[i+1][6]; + eq[16][i] = IsEqual(); + eq[16][i].in[0] <== in[i]; + eq[16][i].in[1] <== 108; + and[9][i] = AND(); + and[9][i].a <== states[i][6]; + and[9][i].b <== eq[16][i].out; + states[i+1][7] <== and[9][i].out; + state_changed[i].in[6] <== states[i+1][7]; + eq[17][i] = IsEqual(); + eq[17][i].in[0] <== in[i]; + eq[17][i].in[1] <== 32; + and[10][i] = AND(); + and[10][i].a <== states[i][7]; + and[10][i].b <== eq[17][i].out; + states[i+1][8] <== and[10][i].out; + state_changed[i].in[7] <== states[i+1][8]; + eq[18][i] = IsEqual(); + eq[18][i].in[0] <== in[i]; + eq[18][i].in[1] <== 119; + and[11][i] = AND(); + and[11][i].a <== states[i][8]; + and[11][i].b <== eq[18][i].out; + states[i+1][9] <== and[11][i].out; + state_changed[i].in[8] <== states[i+1][9]; + and[12][i] = AND(); + and[12][i].a <== states[i][9]; + and[12][i].b <== eq[14][i].out; + states[i+1][10] <== and[12][i].out; + state_changed[i].in[9] <== states[i+1][10]; + eq[19][i] = IsEqual(); + eq[19][i].in[0] <== in[i]; + eq[19][i].in[1] <== 115; + and[13][i] = AND(); + and[13][i].a <== states[i][10]; + and[13][i].b <== eq[19][i].out; + states[i+1][11] <== and[13][i].out; + state_changed[i].in[10] <== states[i+1][11]; + and[14][i] = AND(); + and[14][i].a <== states[i][11]; + and[14][i].b <== eq[17][i].out; + states[i+1][12] <== and[14][i].out; + state_changed[i].in[11] <== 
states[i+1][12]; + and[15][i] = AND(); + and[15][i].a <== states[i][12]; + and[15][i].b <== eq[12][i].out; + states[i+1][13] <== and[15][i].out; + state_changed[i].in[12] <== states[i+1][13]; + and[16][i] = AND(); + and[16][i].a <== states[i][13]; + and[16][i].b <== eq[11][i].out; + states[i+1][14] <== and[16][i].out; + state_changed[i].in[13] <== states[i+1][14]; + and[17][i] = AND(); + and[17][i].a <== states[i][14]; + and[17][i].b <== eq[14][i].out; + states[i+1][15] <== and[17][i].out; + state_changed[i].in[14] <== states[i+1][15]; + eq[20][i] = IsEqual(); + eq[20][i].in[0] <== in[i]; + eq[20][i].in[1] <== 110; + and[18][i] = AND(); + and[18][i].a <== states[i][15]; + and[18][i].b <== eq[20][i].out; + states[i+1][16] <== and[18][i].out; + state_changed[i].in[15] <== states[i+1][16]; + eq[21][i] = IsEqual(); + eq[21][i].in[0] <== in[i]; + eq[21][i].in[1] <== 116; + and[19][i] = AND(); + and[19][i].a <== states[i][16]; + and[19][i].b <== eq[21][i].out; + states[i+1][17] <== and[19][i].out; + state_changed[i].in[16] <== states[i+1][17]; + and[20][i] = AND(); + and[20][i].a <== states[i][17]; + and[20][i].b <== eq[17][i].out; + states[i+1][18] <== and[20][i].out; + state_changed[i].in[17] <== states[i+1][18]; + eq[22][i] = IsEqual(); + eq[22][i].in[0] <== in[i]; + eq[22][i].in[1] <== 102; + and[21][i] = AND(); + and[21][i].a <== states[i][18]; + and[21][i].b <== eq[22][i].out; + states[i+1][19] <== and[21][i].out; + state_changed[i].in[18] <== states[i+1][19]; + eq[23][i] = IsEqual(); + eq[23][i].in[0] <== in[i]; + eq[23][i].in[1] <== 111; + and[22][i] = AND(); + and[22][i].a <== states[i][19]; + and[22][i].b <== eq[23][i].out; + states[i+1][20] <== and[22][i].out; + state_changed[i].in[19] <== states[i+1][20]; + eq[24][i] = IsEqual(); + eq[24][i].in[0] <== in[i]; + eq[24][i].in[1] <== 114; + and[23][i] = AND(); + and[23][i].a <== states[i][20]; + and[23][i].b <== eq[24][i].out; + states[i+1][21] <== and[23][i].out; + state_changed[i].in[20] <== states[i+1][21]; + 
and[24][i] = AND(); + and[24][i].a <== states[i][21]; + and[24][i].b <== eq[17][i].out; + states[i+1][22] <== and[24][i].out; + state_changed[i].in[21] <== states[i+1][22]; + eq[25][i] = IsEqual(); + eq[25][i].in[0] <== in[i]; + eq[25][i].in[1] <== 64; + and[25][i] = AND(); + and[25][i].a <== states[i][22]; + and[25][i].b <== eq[25][i].out; + states[i+1][23] <== and[25][i].out; + state_changed[i].in[22] <== states[i+1][23]; + states[i+1][0] <== 1 - state_changed[i].out; + } + + component final_state_result = MultiOR(num_bytes+1); + for (var i = 0; i <= num_bytes; i++) { + final_state_result.in[i] <== states[i][4]; + } + out <== final_state_result.out; + + signal is_consecutive[msg_bytes+1][2]; + is_consecutive[msg_bytes][1] <== 1; + for (var i = 0; i < msg_bytes; i++) { + is_consecutive[msg_bytes-1-i][0] <== states[num_bytes-i][4] * (1 - is_consecutive[msg_bytes-i][1]) + is_consecutive[msg_bytes-i][1]; + is_consecutive[msg_bytes-1-i][1] <== state_changed[msg_bytes-i].out * is_consecutive[msg_bytes-1-i][0]; + } + // substrings calculated: [{(1, 1), (23, 1)}] + signal is_substr0[msg_bytes][3]; + signal is_reveal0[msg_bytes]; + signal output reveal0[msg_bytes]; + for (var i = 0; i < msg_bytes; i++) { + is_substr0[i][0] <== 0; + is_substr0[i][1] <== is_substr0[i][0] + states[i+1][1] * states[i+2][1]; + is_substr0[i][2] <== is_substr0[i][1] + states[i+1][23] * states[i+2][1]; + is_reveal0[i] <== is_substr0[i][2] * is_consecutive[i][1]; + reveal0[i] <== in[i+1] * is_reveal0[i]; + } +} \ No newline at end of file diff --git a/packages/circom/tests/circuits/simple_regex_decomposed.json b/packages/circom/tests/circuits/simple_regex_decomposed.json new file mode 100644 index 0000000..7447ee4 --- /dev/null +++ b/packages/circom/tests/circuits/simple_regex_decomposed.json @@ -0,0 +1,16 @@ +{ + "parts":[ + { + "is_public": false, + "regex_def": "email was meant for @" + }, + { + "is_public": true, + "regex_def": 
"(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_)+" + }, + { + "is_public": false, + "regex_def": "." + } + ] +} \ No newline at end of file diff --git a/packages/circom/tests/circuits/test_simple_regex.circom b/packages/circom/tests/circuits/test_simple_regex.circom index 2d3cda4..e5563e1 100644 --- a/packages/circom/tests/circuits/test_simple_regex.circom +++ b/packages/circom/tests/circuits/test_simple_regex.circom @@ -1,3 +1,4 @@ +pragma circom 2.1.5; include "./simple_regex.circom"; // 1=(a|b) (2=(b|c)+ )+d component main = SimpleRegex(64); \ No newline at end of file diff --git a/packages/circom/tests/circuits/test_simple_regex_decomposed.circom b/packages/circom/tests/circuits/test_simple_regex_decomposed.circom new file mode 100644 index 0000000..6f1f3b3 --- /dev/null +++ b/packages/circom/tests/circuits/test_simple_regex_decomposed.circom @@ -0,0 +1,5 @@ +pragma circom 2.1.5; + +include "./simple_regex_decomposed.circom"; +// email was meant for @[a-zA-Z0-9_]+\. 
+component main = SimpleRegexDecomposed(64); \ No newline at end of file diff --git a/packages/circom/tests/simple_regex.test.ts b/packages/circom/tests/simple_regex.test.ts index 2d9d72c..e86d6f4 100644 --- a/packages/circom/tests/simple_regex.test.ts +++ b/packages/circom/tests/simple_regex.test.ts @@ -79,6 +79,7 @@ describe("Simple Regex", () => { const circuitInputs = { msg: paddedStr, }; + // const circuit = await wasm_tester(path.join(__dirname, "./circuits/test_simple_regex.circom"), option); const witness = await circuit.calculateWitness(circuitInputs); await circuit.checkConstraints(witness); @@ -94,6 +95,4 @@ describe("Simple Regex", () => { } } }); - - -}); \ No newline at end of file +}); diff --git a/packages/circom/tests/simple_regex_decomposed.test.ts b/packages/circom/tests/simple_regex_decomposed.test.ts new file mode 100644 index 0000000..cec6f1f --- /dev/null +++ b/packages/circom/tests/simple_regex_decomposed.test.ts @@ -0,0 +1,48 @@ +const ff = require('ffjavascript'); +const stringifyBigInts = ff.utils.stringifyBigInts; +const circom_tester = require("circom_tester"); +const wasm_tester = circom_tester.wasm; +import * as path from "path"; +const p = "21888242871839275222246405745257275088548364400416034343698204186575808495617"; +const field = new ff.F1Field(p); +const apis = require("../../apis"); +const option = { + include: path.join(__dirname, "../../../node_modules") +}; +const compiler = require("../../compiler"); + +jest.setTimeout(120000); +describe("Simple Regex Decomposed", () => { + let circuit; + beforeAll(async () => { + compiler.genFromDecomposed(path.join(__dirname, "./circuits/simple_regex_decomposed.json"), { + circomFilePath: path.join(__dirname, "./circuits/simple_regex_decomposed.circom"), + templateName: "SimpleRegexDecomposed", + genSubstrs: true + }); + circuit = await wasm_tester(path.join(__dirname, "./circuits/test_simple_regex_decomposed.circom"), option); + }); + + it("case 1", async () => { + const input = "email 
was meant for @zkRegex."; + const paddedStr = apis.padString(input, 64); + const circuitInputs = { + msg: paddedStr, + }; + const witness = await circuit.calculateWitness(circuitInputs); + await circuit.checkConstraints(witness); + // console.log(witness); + expect(1n).toEqual(witness[1]); + const revealedIdx = [[21,22,23,24,25,26,27]]; + for (let substr_idx = 0; substr_idx < 1; ++substr_idx) { + for (let idx = 0; idx < 64; ++idx) { + if (revealedIdx[substr_idx].includes(idx)) { + expect(BigInt(paddedStr[idx])).toEqual(witness[2 + 64 * substr_idx + idx]); + } else { + expect(0n).toEqual(witness[2 + 64 * substr_idx + idx]); + } + } + } + }); + +}); \ No newline at end of file diff --git a/packages/compiler/src/circom.rs b/packages/compiler/src/circom.rs index 3d7e637..ef0a820 100644 --- a/packages/compiler/src/circom.rs +++ b/packages/compiler/src/circom.rs @@ -21,8 +21,7 @@ impl RegexAndDFA { template_name: &str, gen_substrs: bool, ) -> Result<(), CompilerError> { - // let all_regex = String::new(); - let circom = gen_circom_allstr(&self.dfa_val, template_name)?; + let circom = gen_circom_allstr(&self.dfa_val, template_name, &self.regex_str)?; if gen_substrs { self.add_substrs_constraints(circom_path, circom)?; } else { @@ -49,6 +48,7 @@ impl RegexAndDFA { circom += "\t}\n"; let substr_defs_array = &self.substrs_defs.substr_defs_array; + circom += &format!("\t// substrings calculated: {:?}\n", &self.substrs_defs.substr_defs_array); for (idx, defs) in substr_defs_array.into_iter().enumerate() { let num_defs = defs.len(); circom += &format!("\tsignal is_substr{}[msg_bytes][{}];\n", idx, num_defs + 1); diff --git a/packages/compiler/src/gen_circom.ts b/packages/compiler/src/gen_circom.ts index b06854c..164a693 100644 --- a/packages/compiler/src/gen_circom.ts +++ b/packages/compiler/src/gen_circom.ts @@ -3,323 +3,354 @@ type Graph = { edges: Record; }[]; -function genCircomAllstr(graph_json: Graph, template_name: string): string { - const N: number = graph_json.length; 
- // console.log(JSON.stringify(graph_json, null, 2)); - // const graph = Array(N).fill({}); - const rev_graph: Record> = {}; - const to_init_graph: number[][] = []; - let init_going_state: number | null = null; - - for (let i = 0; i < N; i++) { - rev_graph[i] = {}; - to_init_graph.push([]); - } - - const accept_nodes: Set = new Set(); - for (let i = 0; i < N; i++) { - const node = graph_json[i]; - for (let k in node.edges) { - const v: number = node.edges[k]; - rev_graph[v][i] = Array.from(JSON.parse(k)).map(c => (c as string).charCodeAt(0)); - if (i === 0) { - const index = rev_graph[v][i].indexOf(94); - if (index !== -1) { - init_going_state = v; - rev_graph[v][i][index] = 255; - } - for (let j = 0; j < rev_graph[v][i].length; j++) { - if (rev_graph[v][i][j] == 255) { - continue; - } - to_init_graph[v].push(rev_graph[v][i][j]); - } - } +function genCircomAllstr( + graph_json: Graph, + template_name: string, + regex_str = "" +): string { + /** + * This function generates a Circom circuit from a given graph_json, template_name, and regex_str. + * @param {Object} graph_json - The graph in JSON format. + * @param {string} template_name - The name to be used for the Circom template. + * @param {string} regex_str - The regular expression string, used only to print in a comment at the top. 
+ */ + + const N = graph_json.length; + // console.log(JSON.stringify(graph_json, null, 2)); + // const graph = Array(N).fill({}); + const rev_graph: Record> = {}; + const to_init_graph: number[][] = []; + let init_going_state: number | null = null; + + for (let i = 0; i < N; i++) { + rev_graph[i] = {}; + to_init_graph.push([]); + } + + const accept_nodes: Set = new Set(); + for (let i = 0; i < N; i++) { + const node = graph_json[i]; + for (let k in node.edges) { + const v: number = node.edges[k]; + rev_graph[v][i] = Array.from(JSON.parse(k)).map((c) => + (c as string).charCodeAt(0) + ); + if (i === 0) { + const index = rev_graph[v][i].indexOf(94); + if (index !== -1) { + init_going_state = v; + rev_graph[v][i][index] = 255; } - if (node.type == "accept") { - accept_nodes.add(i); - } - } - - if (init_going_state !== null) { - for (const [going_state, chars] of Object.entries(to_init_graph)) { - const going_state_num = Number(going_state); - if (chars.length === 0) { - continue; - } - if (rev_graph[going_state_num][init_going_state] == null) { - rev_graph[going_state_num][init_going_state] = []; - } - rev_graph[going_state_num][init_going_state] = rev_graph[going_state_num][init_going_state].concat(chars); - + for (let j = 0; j < rev_graph[v][i].length; j++) { + if (rev_graph[v][i][j] == 255) { + continue; + } + to_init_graph[v].push(rev_graph[v][i][j]); } + } } - - if (accept_nodes.size === 0) { - throw new Error("accept node must exist"); + if (node.type == "accept") { + accept_nodes.add(i); } - const accept_nodes_array = [...accept_nodes]; - if (accept_nodes_array.length !== 1) { - throw new Error("the size of accept nodes must be one"); + } + + if (init_going_state !== null) { + for (const [going_state, chars] of Object.entries(to_init_graph)) { + const going_state_num = Number(going_state); + if (chars.length === 0) { + continue; + } + if (rev_graph[going_state_num][init_going_state] == null) { + rev_graph[going_state_num][init_going_state] = []; + } + 
rev_graph[going_state_num][init_going_state] = + rev_graph[going_state_num][init_going_state].concat(chars); } - - let eq_i: number = 0; - let lt_i: number = 0; - let and_i: number = 0; - let multi_or_i: number = 0; - - const range_checks: number[][][] = new Array(256); - for (let i = 0; i < 256; i++) { - range_checks[i] = new Array(256); - } - const eq_checks: number[] = new Array(256); - const multi_or_checks1: Record = {}; - const multi_or_checks2: Record = {}; - - let lines: string[] = []; - lines.push(`\tfor (var i = 0; i < num_bytes; i++) {`); - - // const uppercase = new Set(Array.from("ABCDEFGHIJKLMNOPQRSTUVWXYZ").map(c => c.charCodeAt())); - // const lowercase = new Set(Array.from("abcdefghijklmnopqrstuvwxyz").map(c => c.charCodeAt())); - // const digits = new Set(Array.from("0123456789").map(c => c.charCodeAt())); - // const symbols1 = new Set([":", ";", "<", "=", ">", "?", "@"].map(c => c.charCodeAt())); - // const symbols2 = new Set(["[", "\\", "]", "^", "_", "`"].map(c => c.charCodeAt())); - // const symbols3 = new Set(["{", "|", "}", "~"].map(c => c.charCodeAt())); - lines.push(`\t\tstate_changed[i] = MultiOR(${N - 1});`); - - for (let i = 1; i < N; i++) { - const outputs: number[] = []; - // let is_negates = []; - for (const prev_i of Object.keys(rev_graph[i])) { - const prev_i_num = Number(prev_i); - const k = rev_graph[i][prev_i_num]; - k.sort((a, b) => Number(a) - Number(b)); - const eq_outputs: [string, number][] = []; - let vals: Set = new Set(k); - // let is_negate = false; - // if (vals.has(0xff)) { - // vals.delete(0xff); - // is_negate = true; - // } - if (vals.size === 0) { - continue; - } - // if (is_negate === true) { - // for (let another_i = 1; another_i < N; another_i++) { - // if (i === another_i) { - // continue; - // } - // if (rev_graph[another_i][prev_i] === null) { - // continue; - // } - // const another_vals = new Set(rev_graph[another_i][prev_i]); - // if (another_vals.size === 0) { - // continue; - // } - // for (let 
another_val of another_vals) { - // vals.add(another_val); - // } - // } - // } - const min_maxes: [number, number][] = []; - let cur_min: number = k[0]; - let cur_max: number = k[0]; - - for (let idx = 1; idx < k.length; ++idx) { - if (cur_max === k[idx]) { - continue; - } - else if (cur_max + 1 === k[idx]) { - cur_max += 1; - } else { - if (cur_max - cur_min >= 16) { - min_maxes.push([cur_min, cur_max]); - } - cur_min = k[idx]; - cur_max = k[idx]; - } - } - - if (cur_max - cur_min >= 16) { - min_maxes.push([cur_min, cur_max]); - } - for (const min_max of min_maxes) { - for (let code = min_max[0]; code <= min_max[1]; ++code) { - vals.delete(code); - } - } - - // for (let subsets of [ - // [digits, 47, 58], - // [symbols1, 57, 65], - // [uppercase, 64, 91], - // [symbols2, 90, 97], - // [lowercase, 96, 123], - // [symbols3, 122, 127] - // ]) { - // const subset = subsets[0]; - // const min = subsets[1]; - // const max = subsets[2]; - // if (vals.isSuperset(subset)) { - // vals.difference(subset); - // if (min_maxs.length == 0) { - // min_maxs.push([min, max]); - // } else { - // const last = min_maxs[min_maxs.length - 1]; - // if (last[1] - 1 == min) { - // min_maxs[min_maxs.length - 1][1] = max; - // } else { - // min_maxs.push([min, max]); - // } - // } - // } - // } - - for (let min_max of min_maxes) { - const min: number = min_max[0]; - const max: number = min_max[1]; - if (range_checks[min][max] === undefined) { - lines.push(`\t\tlt[${lt_i}][i] = LessEqThan(8);`); - lines.push(`\t\tlt[${lt_i}][i].in[0] <== ${min};`); - lines.push(`\t\tlt[${lt_i}][i].in[1] <== in[i];`); - - lines.push(`\t\tlt[${lt_i + 1}][i] = LessEqThan(8);`); - lines.push(`\t\tlt[${lt_i + 1}][i].in[0] <== in[i];`); - lines.push(`\t\tlt[${lt_i + 1}][i].in[1] <== ${max};`); - - lines.push(`\t\tand[${and_i}][i] = AND();`); - lines.push(`\t\tand[${and_i}][i].a <== lt[${lt_i}][i].out;`); - lines.push(`\t\tand[${and_i}][i].b <== lt[${lt_i + 1}][i].out;`); - - eq_outputs.push(['and', and_i]); - 
range_checks[min][max] = [lt_i, and_i]; - lt_i += 2; - and_i += 1; - } else { - let [_, and_i] = range_checks[min][max]; - eq_outputs.push(['and', and_i]); - } - - } - for (let code of vals) { - if (eq_checks[code] === undefined) { - lines.push(`\t\teq[${eq_i}][i] = IsEqual();`); - lines.push(`\t\teq[${eq_i}][i].in[0] <== in[i];`); - lines.push(`\t\teq[${eq_i}][i].in[1] <== ${code};`); - eq_outputs.push(['eq', eq_i]); - eq_checks[code] = eq_i; - eq_i += 1; - } else { - eq_outputs.push(['eq', eq_checks[code]]); - } - } - - lines.push(`\t\tand[${and_i}][i] = AND();`); - lines.push(`\t\tand[${and_i}][i].a <== states[i][${prev_i}];`); - if (eq_outputs.length === 1) { - // if (is_negate) { - // lines.push(`\t\tand[${and_i}][i].b <== 1 - ${eq_outputs[0][0]}[${eq_outputs[0][1]}][i].out;`); - // } else { - // lines.push(`\t\tand[${and_i}][i].b <== ${eq_outputs[0][0]}[${eq_outputs[0][1]}][i].out;`); - // } - lines.push(`\t\tand[${and_i}][i].b <== ${eq_outputs[0][0]}[${eq_outputs[0][1]}][i].out;`); - } else if (eq_outputs.length > 1) { - const eq_outputs_key: string = JSON.stringify(eq_outputs); - if (multi_or_checks1[eq_outputs_key] === undefined) { - lines.push(`\t\tmulti_or[${multi_or_i}][i] = MultiOR(${eq_outputs.length});`); - for (let output_i = 0; output_i < eq_outputs.length; output_i++) { - lines.push(`\t\tmulti_or[${multi_or_i}][i].in[${output_i}] <== ${eq_outputs[output_i][0]}[${eq_outputs[output_i][1]}][i].out;`); - } - // if (is_negate) { - // lines.push(`\t\tand[${and_i}][i].b <== 1 - multi_or[${multi_or_i}][i].out;`); - // } else { - // lines.push(`\t\tand[${and_i}][i].b <== multi_or[${multi_or_i}][i].out;`); - // } - lines.push(`\t\tand[${and_i}][i].b <== multi_or[${multi_or_i}][i].out;`); - multi_or_checks1[eq_outputs_key] = multi_or_i; - multi_or_i += 1 - } else { - lines.push(`\t\tand[${and_i}][i].b <== multi_or[${multi_or_checks1[eq_outputs_key]}][i].out;`); - } - } - - outputs.push(and_i); - and_i += 1; + } + + if (accept_nodes.size === 0) { + throw new 
Error("accept node must exist"); + } + const accept_nodes_array = [...accept_nodes]; + if (accept_nodes_array.length !== 1) { + throw new Error("the size of accept nodes must be one"); + } + + let eq_i: number = 0; + let lt_i: number = 0; + let and_i: number = 0; + let multi_or_i: number = 0; + + const range_checks: number[][][] = new Array(256); + for (let i = 0; i < 256; i++) { + range_checks[i] = new Array(256); + } + const eq_checks: number[] = new Array(256); + const multi_or_checks1: Record = {}; + const multi_or_checks2: Record = {}; + + let lines: string[] = []; + lines.push(`\tfor (var i = 0; i < num_bytes; i++) {`); + + // const uppercase = new Set(Array.from("ABCDEFGHIJKLMNOPQRSTUVWXYZ").map(c => c.charCodeAt())); + // const lowercase = new Set(Array.from("abcdefghijklmnopqrstuvwxyz").map(c => c.charCodeAt())); + // const digits = new Set(Array.from("0123456789").map(c => c.charCodeAt())); + // const symbols1 = new Set([":", ";", "<", "=", ">", "?", "@"].map(c => c.charCodeAt())); + // const symbols2 = new Set(["[", "\\", "]", "^", "_", "`"].map(c => c.charCodeAt())); + // const symbols3 = new Set(["{", "|", "}", "~"].map(c => c.charCodeAt())); + lines.push(`\t\tstate_changed[i] = MultiOR(${N - 1});`); + + for (let i = 1; i < N; i++) { + const outputs: number[] = []; + // let is_negates = []; + for (const prev_i of Object.keys(rev_graph[i])) { + const prev_i_num = Number(prev_i); + const k = rev_graph[i][prev_i_num]; + k.sort((a, b) => Number(a) - Number(b)); + const eq_outputs: [string, number][] = []; + let vals: Set = new Set(k); + // let is_negate = false; + // if (vals.has(0xff)) { + // vals.delete(0xff); + // is_negate = true; + // } + if (vals.size === 0) { + continue; + } + // if (is_negate === true) { + // for (let another_i = 1; another_i < N; another_i++) { + // if (i === another_i) { + // continue; + // } + // if (rev_graph[another_i][prev_i] === null) { + // continue; + // } + // const another_vals = new Set(rev_graph[another_i][prev_i]); + 
// if (another_vals.size === 0) { + // continue; + // } + // for (let another_val of another_vals) { + // vals.add(another_val); + // } + // } + // } + const min_maxes: [number, number][] = []; + let cur_min: number = k[0]; + let cur_max: number = k[0]; + + for (let idx = 1; idx < k.length; ++idx) { + if (cur_max === k[idx]) { + continue; + } else if (cur_max + 1 === k[idx]) { + cur_max += 1; + } else { + if (cur_max - cur_min >= 16) { + min_maxes.push([cur_min, cur_max]); + } + cur_min = k[idx]; + cur_max = k[idx]; } - - if (outputs.length === 1) { - lines.push(`\t\tstates[i+1][${i}] <== and[${outputs[0]}][i].out;`); - } else if (outputs.length > 1) { - const outputs_key: string = JSON.stringify(outputs); - if (multi_or_checks2[outputs_key] === undefined) { - lines.push(`\t\tmulti_or[${multi_or_i}][i] = MultiOR(${outputs.length});`); - for (let output_i = 0; output_i < outputs.length; output_i++) { - lines.push(`\t\tmulti_or[${multi_or_i}][i].in[${output_i}] <== and[${outputs[output_i]}][i].out;`); - } - lines.push(`\t\tstates[i+1][${i}] <== multi_or[${multi_or_i}][i].out;`); - multi_or_checks2[outputs_key] = multi_or_i; - multi_or_i += 1; - } else { - lines.push(`\t\tstates[i+1][${i}] <== multi_or[${multi_or_checks2[outputs_key]}][i].out;`); - } + } + + if (cur_max - cur_min >= 16) { + min_maxes.push([cur_min, cur_max]); + } + for (const min_max of min_maxes) { + for (let code = min_max[0]; code <= min_max[1]; ++code) { + vals.delete(code); + } + } + + // for (let subsets of [ + // [digits, 47, 58], + // [symbols1, 57, 65], + // [uppercase, 64, 91], + // [symbols2, 90, 97], + // [lowercase, 96, 123], + // [symbols3, 122, 127] + // ]) { + // const subset = subsets[0]; + // const min = subsets[1]; + // const max = subsets[2]; + // if (vals.isSuperset(subset)) { + // vals.difference(subset); + // if (min_maxs.length == 0) { + // min_maxs.push([min, max]); + // } else { + // const last = min_maxs[min_maxs.length - 1]; + // if (last[1] - 1 == min) { + // 
min_maxs[min_maxs.length - 1][1] = max; + // } else { + // min_maxs.push([min, max]); + // } + // } + // } + // } + + for (let min_max of min_maxes) { + const min: number = min_max[0]; + const max: number = min_max[1]; + if (range_checks[min][max] === undefined) { + lines.push(`\t\tlt[${lt_i}][i] = LessEqThan(8);`); + lines.push(`\t\tlt[${lt_i}][i].in[0] <== ${min};`); + lines.push(`\t\tlt[${lt_i}][i].in[1] <== in[i];`); + + lines.push(`\t\tlt[${lt_i + 1}][i] = LessEqThan(8);`); + lines.push(`\t\tlt[${lt_i + 1}][i].in[0] <== in[i];`); + lines.push(`\t\tlt[${lt_i + 1}][i].in[1] <== ${max};`); + + lines.push(`\t\tand[${and_i}][i] = AND();`); + lines.push(`\t\tand[${and_i}][i].a <== lt[${lt_i}][i].out;`); + lines.push(`\t\tand[${and_i}][i].b <== lt[${lt_i + 1}][i].out;`); + + eq_outputs.push(["and", and_i]); + range_checks[min][max] = [lt_i, and_i]; + lt_i += 2; + and_i += 1; + } else { + let [_, and_i] = range_checks[min][max]; + eq_outputs.push(["and", and_i]); + } + } + for (let code of vals) { + if (eq_checks[code] === undefined) { + lines.push(`\t\teq[${eq_i}][i] = IsEqual();`); + lines.push(`\t\teq[${eq_i}][i].in[0] <== in[i];`); + lines.push(`\t\teq[${eq_i}][i].in[1] <== ${code};`); + eq_outputs.push(["eq", eq_i]); + eq_checks[code] = eq_i; + eq_i += 1; + } else { + eq_outputs.push(["eq", eq_checks[code]]); + } + } + + lines.push(`\t\tand[${and_i}][i] = AND();`); + lines.push(`\t\tand[${and_i}][i].a <== states[i][${prev_i}];`); + if (eq_outputs.length === 1) { + // if (is_negate) { + // lines.push(`\t\tand[${and_i}][i].b <== 1 - ${eq_outputs[0][0]}[${eq_outputs[0][1]}][i].out;`); + // } else { + // lines.push(`\t\tand[${and_i}][i].b <== ${eq_outputs[0][0]}[${eq_outputs[0][1]}][i].out;`); + // } + lines.push( + `\t\tand[${and_i}][i].b <== ${eq_outputs[0][0]}[${eq_outputs[0][1]}][i].out;` + ); + } else if (eq_outputs.length > 1) { + const eq_outputs_key: string = JSON.stringify(eq_outputs); + if (multi_or_checks1[eq_outputs_key] === undefined) { + lines.push( + 
`\t\tmulti_or[${multi_or_i}][i] = MultiOR(${eq_outputs.length});` + ); + for (let output_i = 0; output_i < eq_outputs.length; output_i++) { + lines.push( + `\t\tmulti_or[${multi_or_i}][i].in[${output_i}] <== ${eq_outputs[output_i][0]}[${eq_outputs[output_i][1]}][i].out;` + ); + } + // if (is_negate) { + // lines.push(`\t\tand[${and_i}][i].b <== 1 - multi_or[${multi_or_i}][i].out;`); + // } else { + // lines.push(`\t\tand[${and_i}][i].b <== multi_or[${multi_or_i}][i].out;`); + // } + lines.push( + `\t\tand[${and_i}][i].b <== multi_or[${multi_or_i}][i].out;` + ); + multi_or_checks1[eq_outputs_key] = multi_or_i; + multi_or_i += 1; + } else { + lines.push( + `\t\tand[${and_i}][i].b <== multi_or[${multi_or_checks1[eq_outputs_key]}][i].out;` + ); } + } - lines.push(`\t\tstate_changed[i].in[${i - 1}] <== states[i+1][${i}];`); + outputs.push(and_i); + and_i += 1; } - lines.push(`\t\tstates[i+1][0] <== 1 - state_changed[i].out;`); - lines.push(`\t}`); - - - const declarations: string[] = []; - declarations.push(`pragma circom 2.1.5;\n`); - declarations.push(`include "@zk-email/zk-regex-circom/circuits/regex_helpers.circom";\n`); - // declarations.push(`pragma circom 2.1.5;\ninclude "@zk-email/circuits/regexes/regex_helpers.circom";\n`); - declarations.push(`template ${template_name}(msg_bytes) {`); - declarations.push(`\tsignal input msg[msg_bytes];`); - declarations.push(`\tsignal output out;\n`); - declarations.push(`\tvar num_bytes = msg_bytes+1;`); - declarations.push(`\tsignal in[num_bytes];`); - declarations.push(`\tin[0]<==255;`); - declarations.push(`\tfor (var i = 0; i < msg_bytes; i++) {`); - declarations.push(`\t\tin[i+1] <== msg[i];`); - declarations.push(`\t}\n`); - if (eq_i > 0) { - declarations.push(`\tcomponent eq[${eq_i}][num_bytes];`); - } - if (lt_i > 0) { - declarations.push(`\tcomponent lt[${lt_i}][num_bytes];`); - } - if (and_i > 0) { - declarations.push(`\tcomponent and[${and_i}][num_bytes];`); - } - if (multi_or_i > 0) { - 
declarations.push(`\tcomponent multi_or[${multi_or_i}][num_bytes];`); + if (outputs.length === 1) { + lines.push(`\t\tstates[i+1][${i}] <== and[${outputs[0]}][i].out;`); + } else if (outputs.length > 1) { + const outputs_key: string = JSON.stringify(outputs); + if (multi_or_checks2[outputs_key] === undefined) { + lines.push( + `\t\tmulti_or[${multi_or_i}][i] = MultiOR(${outputs.length});` + ); + for (let output_i = 0; output_i < outputs.length; output_i++) { + lines.push( + `\t\tmulti_or[${multi_or_i}][i].in[${output_i}] <== and[${outputs[output_i]}][i].out;` + ); + } + lines.push(`\t\tstates[i+1][${i}] <== multi_or[${multi_or_i}][i].out;`); + multi_or_checks2[outputs_key] = multi_or_i; + multi_or_i += 1; + } else { + lines.push( + `\t\tstates[i+1][${i}] <== multi_or[${multi_or_checks2[outputs_key]}][i].out;` + ); + } } - declarations.push(`\tsignal states[num_bytes+1][${N}];`); - declarations.push(`\tcomponent state_changed[num_bytes];`); - declarations.push(""); - - const init_code: string[] = []; - init_code.push(`\tstates[0][0] <== 1;`); - init_code.push(`\tfor (var i = 1; i < ${N}; i++) {`); - init_code.push(`\t\tstates[0][i] <== 0;`); - init_code.push("\t}"); - init_code.push(""); - - lines = declarations.concat(init_code).concat(lines); - - const accept_node: number = accept_nodes_array[0]; - const accept_lines = [""]; - accept_lines.push("\tcomponent final_state_result = MultiOR(num_bytes+1);"); - accept_lines.push("\tfor (var i = 0; i <= num_bytes; i++) {"); - accept_lines.push(`\t\tfinal_state_result.in[i] <== states[i][${accept_node}];`); - accept_lines.push("\t}"); - accept_lines.push("\tout <== final_state_result.out;"); - lines = lines.concat(accept_lines); - let string: string = lines.reduce((res, line) => res + line + "\n", ""); - return string; + lines.push(`\t\tstate_changed[i].in[${i - 1}] <== states[i+1][${i}];`); + } + + lines.push(`\t\tstates[i+1][0] <== 1 - state_changed[i].out;`); + lines.push("\t}"); + + const declarations: string[] = []; + 
declarations.push(`pragma circom 2.1.5;\n`); + declarations.push( + `include "@zk-email/zk-regex-circom/circuits/regex_helpers.circom";\n` + ); + // declarations.push(`pragma circom 2.1.5;\ninclude "@zk-email/circuits/regexes/regex_helpers.circom";\n`); + declarations.push(`// regex: ${regex_str.replace(/\n/g, "\\n")}`); + declarations.push(`template ${template_name}(msg_bytes) {`); + declarations.push(`\tsignal input msg[msg_bytes];`); + declarations.push(`\tsignal output out;\n`); + declarations.push(`\tvar num_bytes = msg_bytes+1;`); + declarations.push(`\tsignal in[num_bytes];`); + declarations.push(`\tin[0]<==255;`); + declarations.push(`\tfor (var i = 0; i < msg_bytes; i++) {`); + declarations.push(`\t\tin[i+1] <== msg[i];`); + declarations.push(`\t}\n`); + if (eq_i > 0) { + declarations.push(`\tcomponent eq[${eq_i}][num_bytes];`); + } + if (lt_i > 0) { + declarations.push(`\tcomponent lt[${lt_i}][num_bytes];`); + } + if (and_i > 0) { + declarations.push(`\tcomponent and[${and_i}][num_bytes];`); + } + if (multi_or_i > 0) { + declarations.push(`\tcomponent multi_or[${multi_or_i}][num_bytes];`); + } + declarations.push(`\tsignal states[num_bytes+1][${N}];`); + declarations.push(`\tcomponent state_changed[num_bytes];`); + declarations.push(""); + + const init_code: string[] = []; + init_code.push(`\tstates[0][0] <== 1;`); + init_code.push(`\tfor (var i = 1; i < ${N}; i++) {`); + init_code.push(`\t\tstates[0][i] <== 0;`); + init_code.push("\t}"); + init_code.push(""); + + lines = declarations.concat(init_code).concat(lines); + + const accept_node: number = accept_nodes_array[0]; + const accept_lines = [""]; + accept_lines.push("\tcomponent final_state_result = MultiOR(num_bytes+1);"); + accept_lines.push("\tfor (var i = 0; i <= num_bytes; i++) {"); + accept_lines.push( + `\t\tfinal_state_result.in[i] <== states[i][${accept_node}];` + ); + accept_lines.push("\t}"); + accept_lines.push("\tout <== final_state_result.out;"); + + lines = lines.concat(accept_lines); + 
let string: string = lines.reduce((res, line) => res + line + "\n", ""); + return string; } // Commented these two out as they're only used by the code that's also commented out diff --git a/packages/compiler/src/js_caller.rs b/packages/compiler/src/js_caller.rs index e054006..5938a56 100644 --- a/packages/compiler/src/js_caller.rs +++ b/packages/compiler/src/js_caller.rs @@ -47,9 +47,9 @@ pub fn regex_to_dfa(regex: &str) -> Result, JsCallerError> { Ok(serde_json::from_str(&result)?) } -pub fn gen_circom_allstr(graph: &[Value], template_name: &str) -> Result { +pub fn gen_circom_allstr(graph: &[Value], template_name: &str, regex_str: &str) -> Result { let code: &'static str = include_str!("gen_circom.js"); let mut script = Script::from_string(code)?; - let result: String = script.call("genCircomAllstr", (graph, template_name))?; + let result: String = script.call("genCircomAllstr", (graph, template_name, regex_str))?; Ok(result) } diff --git a/packages/compiler/src/lib.rs b/packages/compiler/src/lib.rs index 057000f..a5f951e 100644 --- a/packages/compiler/src/lib.rs +++ b/packages/compiler/src/lib.rs @@ -5,6 +5,10 @@ pub mod halo2; pub mod js_caller; pub mod node; + +// #[cfg(test)] +// mod tests; + use crate::node::*; use neon; @@ -64,7 +68,8 @@ pub enum SoldityType { #[derive(Debug, Clone)] pub struct RegexAndDFA { // pub max_byte_size: usize, - // pub all_regex: String, + // Original regex string, only here to be printed in generated file to make it more reproducible + pub regex_str: String, pub dfa_val: Vec, pub substrs_defs: SubstrsDefs, } @@ -93,7 +98,7 @@ impl DecomposedRegexConfig { let substrs_defs = self.extract_substr_ids(&dfa_val)?; Ok(RegexAndDFA { // max_byte_size: self.max_byte_size, - // all_regex, + regex_str: all_regex, dfa_val, substrs_defs, }) @@ -291,8 +296,8 @@ impl DecomposedRegexConfig { let index_ends = part_regexes .iter() .map(|regex| { - // println!("regex {}", regex); - // println!("concat_str {}", concat_str); + println!("regex {}", 
regex); + println!("concat_str {}", concat_str); let found = regex.find(&concat_str).unwrap().unwrap(); // println!("found {:?}", found); if found.start() == found.end() { @@ -339,7 +344,7 @@ impl RegexAndDFA { Ok(RegexAndDFA { // max_byte_size, - // all_regex: regex_str.to_string(), + regex_str: regex_str.to_string(), dfa_val, substrs_defs, }) @@ -529,3 +534,26 @@ fn main(mut cx: neon::prelude::ModuleContext) -> neon::prelude::NeonResult<()> { cx.export_function("genFromRaw", gen_from_raw_node)?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use std::path::Path; + + #[test] + fn test_gen_from_decomposed() { + let decomposed_regex_path = Path::new("../circuits/common/subject_all.json"); + let circom_file_path = Some("../circuits/common/subject_all_regex.circom"); + let circom_template_name = Some("SubjectAllRegex"); + let gen_substrs = Some(true); + + let result = gen_from_decomposed( + decomposed_regex_path.to_str().unwrap(), + circom_file_path.map(|s| s), + circom_template_name.map(|s| s), + gen_substrs, + ); + + // assert!(result.is_ok()); + } +} diff --git a/packages/compiler/src/node.rs b/packages/compiler/src/node.rs index a7ceaa6..0932771 100644 --- a/packages/compiler/src/node.rs +++ b/packages/compiler/src/node.rs @@ -1,46 +1,50 @@ use crate::{gen_from_decomposed, gen_from_raw}; +use neon::context::Context; use neon::prelude::*; pub(crate) fn gen_from_decomposed_node(mut cx: FunctionContext) -> JsResult { + println!("Starting gen_from_decomposed_node function"); let decomposed_regex_path = cx.argument::(0)?.value(&mut cx); + println!("Decomposed regex path: {}", decomposed_regex_path); let obj = cx.argument::(1)?; + println!("Object: {:?}", obj); - // let halo2_dir_path = obj - // .get_opt::(&mut cx, "halo2DirPath")? - // .map(|v| { - // v.to_string(&mut cx) - // .expect("halo2DirPath must be null or string") - // .value(&mut cx) - // }); let circom_file_path = obj .get_opt::(&mut cx, "circomFilePath")? 
.map(|v| { - v.to_string(&mut cx) + let path = v.to_string(&mut cx) .expect("circomFilePath must be null or string") - .value(&mut cx) + .value(&mut cx); + println!("Circom file path: {}", path); + path }); let circom_template_name = obj .get_opt::(&mut cx, "templateName")? .map(|v| { - v.to_string(&mut cx) + let name = v.to_string(&mut cx) .expect("templateName must be null or string") - .value(&mut cx) + .value(&mut cx); + println!("Circom template name: {}", name); + name }); let gen_substrs = obj .get_opt::(&mut cx, "genSubstrs")? .map(|v| { - v.as_value(&mut cx) + let gen = v.as_value(&mut cx) .downcast::(&mut cx) .expect("genSubstrs must be null or boolean") - .value(&mut cx) + .value(&mut cx); + println!("Gen substrs: {}", gen); + gen }); + println!("Calling gen_from_decomposed function"); gen_from_decomposed( &decomposed_regex_path, - // halo2_dir_path.as_ref().map(|s| s.as_str()), circom_file_path.as_ref().map(|s| s.as_str()), circom_template_name.as_ref().map(|s| s.as_str()), gen_substrs, ); + println!("Finished gen_from_decomposed_node function"); Ok(cx.null()) } diff --git a/packages/compiler/src/regex.js b/packages/compiler/src/regex.js index 8293c85..2ae27fa 100644 --- a/packages/compiler/src/regex.js +++ b/packages/compiler/src/regex.js @@ -1,13 +1,129 @@ "use strict"; /* eslint-disable no-prototype-builtins */ /*jslint browser: true*/ + +const a2z_nosep = "abcdefghijklmnopqrstuvwxyz"; +const A2Z_nosep = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +const a2f_nosep = "abcdef"; +const A2F_nosep = "ABCDEF"; +const r0to9_nosep = "0123456789"; +const escapeMap = { n: "\n", r: "\r", t: "\t", v: "\v", f: "\f" }; +const whitespace = Object.values(escapeMap); +const slash_s = whitespace.join("|"); + +/** + * Parse regex to a min DFA spec + * to support some shorthands that make regex easier to write e.g. 
[A-Z] + */ +function regexToMinDFASpec(str) { + // Replace all A-Z with A2Z etc + let combined_nosep = str + .replaceAll("A-Z", A2Z_nosep) + .replaceAll("a-z", a2z_nosep) + .replaceAll("A-F", A2F_nosep) + .replaceAll("a-f", a2f_nosep) + .replaceAll("0-9", r0to9_nosep) + .replaceAll("\\w", A2Z_nosep + r0to9_nosep + a2z_nosep + "_") + .replaceAll("\\d", r0to9_nosep) + .replaceAll("\\s", slash_s); + // .replaceAll("\\w", A2Z_nosep + r0to9_nosep + a2z_nosep); // I think that there's also an underscore here + + function addPipeInsideBrackets(str) { + let result = ""; + let insideBrackets = false; + for (let i = 0; i < str.length; i++) { + if (str[i] === "[") { + result += str[i]; + insideBrackets = true; + continue; + } else if (str[i] === "]") { + insideBrackets = false; + } + let str_to_add = str[i]; + if (str[i] === "\\") { + i++; + str_to_add += str[i]; + } + result += insideBrackets ? "|" + str_to_add : str_to_add; + } + return result.replaceAll("[|", "[").replaceAll("[", "(").replaceAll("]", ")"); + } + + // function makeCurlyBracesFallback(str) { + // let result = ""; + // let insideBrackets = false; + // for (let i = 0; i < str.length; i++) { + // if (str[i] === "{") { + // result += str[i]; + // insideBrackets = true; + // continue; + // } else if (str[i] === "}") { + // insideBrackets = false; + // } + // result += insideBrackets ? 
"|" + str[i] : str[i]; + // } + // return result.replaceAll("[|", "[").replaceAll("[", "(").replaceAll("]", ")"); + // } + + function checkIfBracketsHavePipes(str) { + let result = true; + let insideBrackets = false; + let insideParens = 0; + let indexAt = 0; + for (let i = 0; i < str.length; i++) { + if (indexAt >= str.length) break; + if (str[indexAt] === "[") { + insideBrackets = true; + indexAt++; + continue; + } else if (str[indexAt] === "]") { + insideBrackets = false; + } + if (str[indexAt] === "(") { + insideParens++; + } else if (str[indexAt] === ")") { + insideParens--; + } + if (insideBrackets) { + if (str[indexAt] === "|") { + indexAt++; + } else { + result = false; + return result; + } + } + if (!insideParens && str[indexAt] === "|") { + console.log("Error: | outside of parens!"); + } + if (str[indexAt] === "\\") { + indexAt++; + } + indexAt++; + } + return result; + } + + let combined; + if (!checkIfBracketsHavePipes(combined_nosep)) { + // console.log("Adding pipes within brackets between everything!"); + combined = addPipeInsideBrackets(combined_nosep); + if (!checkIfBracketsHavePipes(combined)) { + console.log("Did not add brackets correctly!"); + } + } else { + combined = combined_nosep; + } + + return combined; +} + /** * Try parsing simple regular expression to syntax tree. * * Basic grammars: * Empty: S -> ϵ * Cat: S -> S S - * Or: S -> S | S + * Or: S -> S | S * Star: S -> S * * Text: S -> [0-9a-zA-Z] * S -> ( S ) @@ -18,242 +134,142 @@ * * @param {string} text The input regular expression * @return {string|object} Returns a string that is an error message if failed to parse the expression, - * otherwise returns an object which is the syntax tree. + * otherwise returns an object which is the syntax tree. 
*/ function parseRegex(text) { - 'use strict'; - function parseSub(text, begin, end, first) { - var i, sub, last = 0, node = { - begin: begin, - end: end, - }, virNode, tempNode, stack = 0, parts = []; - if (text.length === 0) { - return 'Error: empty input at ' + begin + '.'; - } - if (first) { - for (i = 0; i <= text.length; i += 1) { - if (i === text.length || (text[i] === '|' && stack === 0)) { - if (last === 0 && i === text.length) { - return parseSub(text, begin + last, begin + i, false); - } - sub = parseSub(text.slice(last, i), begin + last, begin + i, true); - if (typeof sub === 'string') { - return sub; - } - parts.push(sub); - last = i + 1; - } - else if (text[i] === '(') { - stack += 1; - } - else if (text[i] === ')') { - stack -= 1; - } - } - if (parts.length === 1) { - return parts[0]; - } - node.type = 'or'; - node.parts = parts; - } - else { - for (i = 0; i < text.length; i += 1) { - if (text[i] === '(') { - last = i + 1; - i += 1; - stack = 1; - while (i < text.length && stack !== 0) { - if (text[i] === '(') { - stack += 1; - } - else if (text[i] === ')') { - stack -= 1; - } - i += 1; - } - if (stack !== 0) { - return `Error: missing right parentheses for ${begin + last}.`; - } - i -= 1; - sub = parseSub(text.slice(last, i), begin + last, begin + i, true); - if (typeof sub === 'string') { - return sub; - } - sub.begin -= 1; - sub.end += 1; - parts.push(sub); - // } else if (text[i] === '[') { - // last = i + 1; - // i += 1; - // if (text[i] === '^') { - // text[i] = '\u{ff}'; - // } - // stack = 1; - // while (i < text.length && stack !== 0) { - // if (text[i] === ']') { - // stack -= 1; - // } - // i += 1; - // } - // if (stack !== 0) { - // return 'Error: missing right brakets for ' + (begin + last) + '.'; - // } - // i -= 1; - // sub = parseSub(text.slice(last, i), begin + last, begin + i, true); - // if (typeof sub === 'string') { - // return sub; - // } - // sub.begin -= 1; - // sub.end += 1; - // parts.push(sub); - } - else if (text[i] === 
'*') { - if (parts.length === 0) { - return `Error: unexpected * at ${begin + i}.`; - } - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = 'star'; - tempNode.sub = parts[parts.length - 1]; - parts[parts.length - 1] = tempNode; - } - else if (text[i] === '+') { - if (parts.length === 0) { - return `Error: unexpected + at ${begin + i}.`; - } - virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - virNode.type = 'star'; - virNode.sub = parts[parts.length - 1]; - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = 'cat'; - tempNode.parts = [parts[parts.length - 1], virNode]; - parts[parts.length - 1] = tempNode; - } - else if (text[i] === '?') { - if (parts.length === 0) { - return `Error: unexpected ? at ${begin + i}.`; - } - virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - virNode.type = 'empty'; - virNode.sub = parts[parts.length - 1]; - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = 'or'; - tempNode.parts = [parts[parts.length - 1], virNode]; - parts[parts.length - 1] = tempNode; - } - else if (text[i] === 'ϵ') { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = 'empty'; - parts.push(tempNode); - } - else if (Array.isArray(text[i])) { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = 'text'; - tempNode.text = text[i][0]; - parts.push(tempNode); - } - else { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = 'text'; - tempNode.text = text[i]; - parts.push(tempNode); - } - } - // console.log(`parts ${JSON.stringify(parts)}`); - if (parts.length === 1) { - return parts[0]; - } - node.type = 'cat'; - node.parts = parts; - } - return node; + text = regexToMinDFASpec(text); + "use strict"; + function parseSub(text, begin, end, first) { + var i, + 
sub, + last = 0, + node = { begin: begin, end: end }, + virNode, + tempNode, + stack = 0, + parts = []; + if (text.length === 0) { + return "Error: empty input at " + begin + "."; } - let char; - let new_text = []; - let i = 0; - let is_in_brancket = false; - let brancket_text = []; - while (i < text.length) { - char = text[i]; - if (text[i] == '\\') { - char = [text[i + 1]]; - // new_text.push([text[i + 1]]); - i += 1; - } - if (char === '[') { - if (is_in_brancket) { - return `Error: unexpected [ at ${i}.`; - } - is_in_brancket = true; - brancket_text = []; - // new_text.push(char); - i += 1; - } - else if (char === ']') { - if (!is_in_brancket) { - return `Error: unexpected ] at ${i}.`; - } - is_in_brancket = false; - if (brancket_text[0] === '^') { - brancket_text.shift(); - let rev_text = []; - let code_char = ''; - const brancket_text_jsons = brancket_text.map(val => JSON.stringify(val)); - for (let idx = 0; idx < 255; idx++) { - code_char = String.fromCodePoint(idx); - if ([ - '(', - ')', - '*', - '+', - '.', - '?', - '[', - '\\', - ']', - '^', - '`', - '|', - '-' - ].indexOf(code_char) != -1) { - code_char = [code_char]; - } - if (brancket_text_jsons.indexOf(JSON.stringify(code_char)) === -1) { - rev_text.push(code_char); - } - } - brancket_text = rev_text; - } - new_text.push('('); - for (const c of brancket_text) { - new_text.push(c); - new_text.push('|'); - } - new_text = new_text.slice(0, -1); - new_text.push(')'); - i += 1; - } - else if (is_in_brancket) { - if (!Array.isArray(char) && ['(', ')', '[', '*', '+', '?', 'ϵ'].includes(char)) { - return `Error: unexpected ${char} at ${i}.`; - } - if (char === '^' && text[i - 1] !== '[') { - return `Error: unexpected ^ at ${i}.`; - } - // new_text.push(char); - // new_text.push('|'); - brancket_text.push(char); - i += 1; - } - else { - new_text.push(char); - i += 1; + if (first) { + for (i = 0; i <= text.length; i += 1) { + if (i === text.length || (text[i] === "|" && stack === 0)) { + if (last === 0 && i === 
text.length) { + return parseSub(text, begin + last, begin + i, false); + } + sub = parseSub(text.slice(last, i), begin + last, begin + i, true); + if (typeof sub === "string") { + return sub; + } + parts.push(sub); + last = i + 1; + } else if (text[i] === "(") { + stack += 1; + } else if (text[i] === ")") { + stack -= 1; + } + } + if (parts.length === 1) { + return parts[0]; + } + node.type = "or"; + node.parts = parts; + } else { + for (i = 0; i < text.length; i += 1) { + if (text[i] === "(") { + last = i + 1; + i += 1; + stack = 1; + while (i < text.length && stack !== 0) { + if (text[i] === "(") { + stack += 1; + } else if (text[i] === ")") { + stack -= 1; } + i += 1; + } + if (stack !== 0) { + return "Error: missing right bracket for " + (begin + last) + "."; + } + i -= 1; + sub = parseSub(text.slice(last, i), begin + last, begin + i, true); + if (typeof sub === "string") { + return sub; + } + sub.begin -= 1; + sub.end += 1; + parts.push(sub); + } else if (text[i] === "*") { + if (parts.length === 0) { + return "Error: unexpected * at " + (begin + i) + "."; + } + tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + tempNode.type = "star"; + tempNode.sub = parts[parts.length - 1]; + parts[parts.length - 1] = tempNode; + } else if (text[i] === "+") { + if (parts.length === 0) { + return "Error: unexpected + at " + (begin + i) + "."; + } + virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + virNode.type = "star"; + virNode.sub = parts[parts.length - 1]; + tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + tempNode.type = "cat"; + tempNode.parts = [parts[parts.length - 1], virNode]; + parts[parts.length - 1] = tempNode; + } else if (text[i] === "?") { + if (parts.length === 0) { + return "Error: unexpected + at " + (begin + i) + "."; + } + virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + virNode.type = 
"empty"; + virNode.sub = parts[parts.length - 1]; + tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + tempNode.type = "or"; + tempNode.parts = [parts[parts.length - 1], virNode]; + parts[parts.length - 1] = tempNode; + } else if (text[i] === "ϵ") { + tempNode = { begin: begin + i, end: begin + i + 1 }; + tempNode.type = "empty"; + parts.push(tempNode); + } else if (Array.isArray(text[i])) { + tempNode = { begin: begin + i, end: begin + i + 1 }; + tempNode.type = "text"; + tempNode.text = text[i][0]; + parts.push(tempNode); + } else { + tempNode = { begin: begin + i, end: begin + i + 1 }; + tempNode.type = "text"; + tempNode.text = text[i]; + parts.push(tempNode); + } } - if (is_in_brancket) { - return `Error: missing right brackets.`; + if (parts.length === 1) { + return parts[0]; } - return parseSub(new_text, 0, new_text.length, true); -} + node.type = "cat"; + node.parts = parts; + } + return node; + } + + let new_text = []; + let i = 0; + while (i < text.length) { + if (text[i] === "\\") { + const escapeMap = { n: "\n", r: "\r", t: "\t", v: "\v", f: "\f", "^": String.fromCharCode(128) }; + const char = text[i + 1]; + new_text.push([escapeMap[char] || char]); + i += 2; + } else { + new_text.push(text[i]); + i += 1; + } + } + return parseSub(new_text, 0, new_text.length, true); + } + /** * Convert regular expression to nondeterministic finite automaton. 
* @@ -261,64 +277,59 @@ function parseRegex(text) { * @return {object|string} */ function regexToNfa(text) { - 'use strict'; - function generateGraph(node, start, end, count) { - if ('id' in start) { - start.id = count; - count += 1; - } - switch (node.type) { - case 'empty': - start.edges.push(['ϵ', end]); - break; - case 'text': - start.edges.push([node.text, end]); - break; - case 'cat': - let last = start; - for (let i = 0; i < node.parts.length - 1; i += 1) { - const temp = { type: '', edges: [] }; - count = generateGraph(node.parts[i], last, temp, count); - last = temp; - } - count = generateGraph(node.parts[node.parts.length - 1], last, end, count); - break; - case 'or': - for (let i = 0; i < node.parts.length; i += 1) { - const tempStart = { type: '', edges: [] }; - const tempEnd = { - type: '', - edges: [['ϵ', end]], - }; - start.edges.push(['ϵ', tempStart]); - count = generateGraph(node.parts[i], tempStart, tempEnd, count); - } - break; - case 'star': - const tempStart = { type: '', edges: [] }; - const tempEnd = { - type: '', - edges: [['ϵ', tempStart], ['ϵ', end]], - }; - start.edges.push(['ϵ', tempStart]); - start.edges.push(['ϵ', end]); - count = generateGraph(node.sub, tempStart, tempEnd, count); - break; + 'use strict'; + function generateGraph(node, start, end, count) { + var i, last, temp, tempStart, tempEnd; + if (!start.hasOwnProperty('id')) { + start.id = count; + count += 1; + } + switch (node.type) { + case 'empty': + start.edges.push(['ϵ', end]); + break; + case 'text': + start.edges.push([node.text, end]); + break; + case 'cat': + last = start; + for (i = 0; i < node.parts.length - 1; i += 1) { + temp = { 'type': '', 'edges': [] }; + count = generateGraph(node.parts[i], last, temp, count); + last = temp; } - if (!('id' in end)) { - end.id = count; - count += 1; + count = generateGraph(node.parts[node.parts.length - 1], last, end, count); + break; + case 'or': + for (i = 0; i < node.parts.length; i += 1) { + tempStart = { 'type': '', 
'edges': [] }; + tempEnd = { 'type': '', 'edges': [['ϵ', end]] }; + start.edges.push(['ϵ', tempStart]); + count = generateGraph(node.parts[i], tempStart, tempEnd, count); } - return count; + break; + case 'star': + tempStart = { 'type': '', 'edges': [] }; + tempEnd = { 'type': '', 'edges': [['ϵ', tempStart], ['ϵ', end]] }; + start.edges.push(['ϵ', tempStart]); + start.edges.push(['ϵ', end]); + count = generateGraph(node.sub, tempStart, tempEnd, count); + break; } - const ast = parseRegex(text); - const start = { type: '', edges: [] }; - const accept = { type: 'accept', edges: [] }; - if (typeof ast === 'string') { - return ast; + if (!end.hasOwnProperty('id')) { + end.id = count; + count += 1; } - generateGraph(ast, start, accept, 0); - return start; + return count; + } + var ast = parseRegex(text), + start = { 'type': 'start', 'edges': [] }, + accept = { 'type': 'accept', 'edges': [] }; + if (typeof ast === 'string') { + return ast; + } + generateGraph(ast, start, accept, 0); + return start; } /** * Convert nondeterministic finite automaton to deterministic finite automaton. @@ -327,108 +338,112 @@ function regexToNfa(text) { * @return {object} dfa Returns the first element of the DFA. 
*/ function nfaToDfa(nfa) { - 'use strict'; - function getClosure(nodes) { - const closure = []; - const stack = []; - const symbols = []; - let type = ''; - let top; - for (const node of nodes) { - stack.push(node); - closure.push(node); - if (node.type === 'accept') { - type = 'accept'; - } - } - while (stack.length > 0) { - top = stack.pop(); - if (typeof top === 'string' && top.startsWith('Error')) { - continue; - } - for (const [edgeSymbol, edgeNode] of top.edges) { - if (edgeSymbol === 'ϵ') { - if (!closure.includes(edgeNode)) { - stack.push(edgeNode); - closure.push(edgeNode); - if (edgeNode.type === 'accept') { - type = 'accept'; - } - } - } - else { - if (!symbols.includes(edgeSymbol)) { - symbols.push(edgeSymbol); - } - } - } - } - closure.sort((a, b) => { - if (a.id && b.id) { - return a.id > b.id ? 1 : -1; - } - return 0; - }); - symbols.sort(); - return { - id: '', - key: closure.map((x) => x.id).join(','), - items: closure, - symbols: symbols, - type: type, - edges: [], - trans: {}, - nature: 0, - }; + 'use strict'; + function getClosure(nodes) { + var i, + closure = [], + stack = [], + symbols = [], + type = '', + top; + for (i = 0; i < nodes.length; i += 1) { + stack.push(nodes[i]); + closure.push(nodes[i]); + if (nodes[i].type === 'accept') { + type = 'accept'; + } } - function getClosedMove(closure, symbol) { - const nexts = []; - for (const node of closure.items) { - for (const [edgeSymbol, edgeNode] of node.edges) { - if (edgeSymbol === symbol && !nexts.includes(edgeNode)) { - nexts.push(edgeNode); - } + while (stack.length > 0) { + top = stack.pop(); + // If top is of type string and starts with "Error" then return error + if (typeof top === 'string' && top[0] === 'E') { + continue; + } + for (i = 0; i < top.edges.length; i += 1) { + if (top.edges[i][0] === 'ϵ') { + if (closure.indexOf(top.edges[i][1]) < 0) { + stack.push(top.edges[i][1]); + closure.push(top.edges[i][1]); + if (top.edges[i][1].type === 'accept') { + type = 'accept'; } + } + } 
else { + if (symbols.indexOf(top.edges[i][0]) < 0) { + symbols.push(top.edges[i][0]); + } } - return getClosure(nexts); + } } - function toAlphaCount(n) { - const a = 'A'.charCodeAt(0); - const z = 'Z'.charCodeAt(0); - const len = z - a + 1; - let s = ''; - while (n >= 0) { - s = String.fromCharCode(n % len + a) + s; - n = Math.floor(n / len) - 1; + closure.sort(function (a, b) { + return a.id - b.id; + }); + symbols.sort(); + return { + 'key': closure.map(function (x) { + return x.id; + }).join(','), + 'items': closure, + 'symbols': symbols, + 'type': type, + 'edges': [], + 'trans': {} + }; + } + function getClosedMove(closure, symbol) { + var i, + j, + node, + nexts = []; + for (i = 0; i < closure.items.length; i += 1) { + node = closure.items[i]; + for (j = 0; j < node.edges.length; j += 1) { + if (symbol === node.edges[j][0]) { + if (nexts.indexOf(node.edges[j][1]) < 0) { + nexts.push(node.edges[j][1]); + } } - return s; + } } - let i; - const first = getClosure([nfa]); - const states = {}; - let front = 0; - let top; - let closure; - const queue = [first]; - let count = 0; - first.id = toAlphaCount(count); - states[first.key] = first; - while (front < queue.length) { - top = queue[front]; - front += 1; - for (i = 0; i < top.symbols.length; i += 1) { - closure = getClosedMove(top, top.symbols[i]); - if (!(closure.key in states)) { - count += 1; - closure.id = toAlphaCount(count); - states[closure.key] = closure; - queue.push(closure); - } - top.trans[top.symbols[i]] = states[closure.key]; - top.edges.push([top.symbols[i], states[closure.key]]); - } + return getClosure(nexts); + } + function toAlphaCount(n) { + var a = 'A'.charCodeAt(0), + z = 'Z'.charCodeAt(0), + len = z - a + 1, + s = ''; + while (n >= 0) { + s = String.fromCharCode(n % len + a) + s; + n = Math.floor(n / len) - 1; + } + return s; + } + var i, + first = getClosure([nfa]), + states = {}, + front = 0, + top, + closure, + queue = [first], + count = 0; + first.id = toAlphaCount(count); + 
states[first.key] = first; + while (front < queue.length) { + top = queue[front]; + front += 1; + for (i = 0; i < top.symbols.length; i += 1) { + closure = getClosedMove(top, top.symbols[i]); + if (!states.hasOwnProperty(closure.key)) { + count += 1; + closure.id = toAlphaCount(count); + states[closure.key] = closure; + queue.push(closure); + } + top.trans[top.symbols[i]] = states[closure.key]; + top.edges.push([top.symbols[i], states[closure.key]]); } - return first; + } + return first; } /** * Convert the DFA to its minimum form using Hopcroft's algorithm. @@ -437,213 +452,201 @@ function nfaToDfa(nfa) { * @return {object} dfa Returns the first element of the minimum DFA. */ function minDfa(dfa) { - 'use strict'; - function getReverseEdges(start) { - const symbols = {}; // The input alphabet - const idMap = {}; // Map id to states - const revEdges = {}; // Map id to the ids which connects to the id with an alphabet; - const visited = {}; - visited[start.id] = true; - const queue = [start]; - let front = 0; - let top; - let symbol; - let next; - while (front < queue.length) { - top = queue[front]; - front += 1; - idMap[top.id] = top; - for (symbol of top.symbols) { - const symbolString = symbol; - if (!(symbolString in symbols)) { - symbols[symbolString] = true; - } - next = top.trans[symbolString]; - if (!(next.id in revEdges)) { - revEdges[next.id] = {}; - } - if (!(symbolString in revEdges[next.id])) { - revEdges[next.id][symbolString] = []; - } - revEdges[next.id][symbolString].push(top.id); - if (!(next.id in visited)) { - visited[next.id] = true; - queue.push(next); - } - } + 'use strict'; + function getReverseEdges(start) { + var i, top, symbol, next, + front = 0, + queue = [start], + visited = {}, + symbols = {}, // The input alphabet + idMap = {}, // Map id to states + revEdges = {}; // Map id to the ids which connects to the id with an alphabet + visited[start.id] = true; + while (front < queue.length) { + top = queue[front]; + front += 1; + 
idMap[top.id] = top; + for (i = 0; i < top.symbols.length; i += 1) { + symbol = top.symbols[i]; + if (!symbols.hasOwnProperty(symbol)) { + symbols[symbol] = true; } - return [Object.keys(symbols), idMap, revEdges]; - } - function hopcroft(symbols, idMap, revEdges) { - const ids = Object.keys(idMap).sort(); - const partitions = {}; - const queue = []; - const visited = {}; - let front = 0; - let top; - let i; - let j; - let k; - let keys; - let key; - let key1; - let key2; - let group1; - let group2; - let symbol; - let revGroup; - group1 = []; - group2 = []; - for (i = 0; i < ids.length; i += 1) { - if (idMap[ids[i]].type === 'accept') { - group1.push(ids[i]); - } - else { - group2.push(ids[i]); - } + next = top.trans[symbol]; + if (!revEdges.hasOwnProperty(next.id)) { + revEdges[next.id] = {}; } - key = group1.join(','); - partitions[key] = group1; - queue.push(key); - visited[key] = 0; - if (group2.length !== 0) { - key = group2.join(','); - partitions[key] = group2; - queue.push(key); + if (!revEdges[next.id].hasOwnProperty(symbol)) { + revEdges[next.id][symbol] = []; } - while (front < queue.length) { - top = queue[front]; - front += 1; - if (top !== null) { - top = top.split(','); - for (symbol of symbols) { - revGroup = {}; - for (j = 0; j < top.length; j += 1) { - if (revEdges[top[j]] && revEdges[top[j]][symbol]) { - for (k = 0; k < revEdges[top[j]][symbol].length; k += 1) { - revGroup[revEdges[top[j]][symbol][k]] = true; - } - } - } - keys = Object.keys(partitions); - for (key of keys) { - group1 = []; - group2 = []; - for (k = 0; k < partitions[key].length; k += 1) { - if (revGroup[partitions[key][k]]) { - group1.push(partitions[key][k]); - } - else { - group2.push(partitions[key][k]); - } - } - if (group1.length !== 0 && group2.length !== 0) { - delete partitions[key]; - key1 = group1.join(','); - key2 = group2.join(','); - partitions[key1] = group1; - partitions[key2] = group2; - if (visited[key1]) { - queue[visited[key1]] = null; - visited[key1] = 
queue.length; - queue.push(key1); - visited[key2] = queue.length; - queue.push(key2); - } - else if (group1.length <= group2.length) { - visited[key1] = queue.length; - queue.push(key1); - } - else { - visited[key2] = queue.length; - queue.push(key2); - } - } - } - } - } + revEdges[next.id][symbol].push(top.id); + if (!visited.hasOwnProperty(next.id)) { + visited[next.id] = true; + queue.push(next); } - return Object.values(partitions); + } + } + return [Object.keys(symbols), idMap, revEdges]; + } + function hopcroft(symbols, idMap, revEdges) { + var i, j, k, keys, key, key1, key2, top, group1, group2, symbol, revGroup, + ids = Object.keys(idMap).sort(), + partitions = {}, + front = 0, + queue = [], + visited = {}; + group1 = []; + group2 = []; + for (i = 0; i < ids.length; i += 1) { + if (idMap[ids[i]].type === 'accept') { + group1.push(ids[i]); + } else { + group2.push(ids[i]); + } + } + key = group1.join(','); + partitions[key] = group1; + queue.push(key); + visited[key] = 0; + if (group2.length !== 0) { + key = group2.join(','); + partitions[key] = group2; + queue.push(key); } - function buildMinNfa(start, partitions, idMap, revEdges) { - const nodes = []; - const group = {}; - const edges = {}; - partitions.sort((a, b) => { - const ka = a.join(','); - const kb = b.join(','); - if (ka < kb) { - return -1; + while (front < queue.length) { + top = queue[front]; + front += 1; + if (top) { + top = top.split(','); + for (i = 0; i < symbols.length; i += 1) { + symbol = symbols[i]; + revGroup = {}; + for (j = 0; j < top.length; j += 1) { + if (revEdges.hasOwnProperty(top[j]) && revEdges[top[j]].hasOwnProperty(symbol)) { + for (k = 0; k < revEdges[top[j]][symbol].length; k += 1) { + revGroup[revEdges[top[j]][symbol][k]] = true; + } } - if (ka > kb) { - return 1; + } + keys = Object.keys(partitions); + for (j = 0; j < keys.length; j += 1) { + key = keys[j]; + group1 = []; + group2 = []; + for (k = 0; k < partitions[key].length; k += 1) { + if 
(revGroup.hasOwnProperty(partitions[key][k])) { + group1.push(partitions[key][k]); + } else { + group2.push(partitions[key][k]); + } } - return 0; - }); - for (let i = 0; i < partitions.length; i += 1) { - if (partitions[i].indexOf(start.id.toString()) >= 0) { - if (i > 0) { - const temp = partitions[i]; - partitions[i] = partitions[0]; - partitions[0] = temp; - } - break; + if (group1.length !== 0 && group2.length !== 0) { + delete partitions[key]; + key1 = group1.join(','); + key2 = group2.join(','); + partitions[key1] = group1; + partitions[key2] = group2; + if (visited.hasOwnProperty(key1)) { + queue[visited[key1]] = null; + visited[key1] = queue.length; + queue.push(key1); + visited[key2] = queue.length; + queue.push(key2); + } else if (group1.length <= group2.length) { + visited[key1] = queue.length; + queue.push(key1); + } else { + visited[key2] = queue.length; + queue.push(key2); + } } + } } - for (let i = 0; i < partitions.length; i += 1) { - const node = { - id: (i + 1).toString(), - key: partitions[i].join(','), - items: [], - symbols: [], - type: idMap[partitions[i][0]].type, - edges: [], - trans: {}, - nature: 0, - }; - for (let j = 0; j < partitions[i].length; j += 1) { - node.items.push(idMap[partitions[i][j]]); - group[partitions[i][j]] = i; - } - edges[i] = {}; - nodes.push(node); + } + } + return Object.values(partitions); + } + function buildMinNfa(start, partitions, idMap, revEdges) { + var i, j, temp, node, symbol, + nodes = [], + group = {}, + edges = {}; + partitions.sort(function (a, b) { + var ka = a.join(','), kb = b.join(','); + if (ka < kb) { + return -1; + } + if (ka > kb) { + return 1; + } + return 0; + }); + for (i = 0; i < partitions.length; i += 1) { + if (partitions[i].indexOf(start.id) >= 0) { + if (i > 0) { + temp = partitions[i]; + partitions[i] = partitions[0]; + partitions[0] = temp; } - Object.keys(revEdges).forEach((to) => { - Object.keys(revEdges[to]).forEach((symbol) => { - revEdges[to][symbol].forEach((from) => { - if 
(!edges[group[from]].hasOwnProperty(group[to])) { - edges[group[from]][group[to]] = {}; - } - edges[group[from]][group[to]][symbol] = true; - }); - }); - }); - Object.keys(edges).forEach((from) => { - Object.keys(edges[Number(from)]).forEach((to) => { - const symbol = JSON.stringify(Object.keys(edges[Number(from)][Number(to)]).sort()); - nodes[parseInt(from)].symbols.push(symbol); - nodes[parseInt(from)].edges.push([symbol, nodes[parseInt(to)]]); - nodes[parseInt(from)].trans[symbol] = nodes[parseInt(to)]; - }); - }); - return nodes[0]; + break; + } + } + for (i = 0; i < partitions.length; i += 1) { + node = { + id: (i + 1).toString(), + key: partitions[i].join(','), + items: [], + symbols: [], + type: idMap[partitions[i][0]].type, + edges: [], + trans: {}, + }; + for (j = 0; j < partitions[i].length; j += 1) { + node.items.push(idMap[partitions[i][j]]); + group[partitions[i][j]] = i; + } + edges[i] = {}; + nodes.push(node); } - const [symbols, idMap, revEdges] = getReverseEdges(dfa); - const partitions = hopcroft(symbols, idMap, revEdges); - return buildMinNfa(dfa, partitions, idMap, revEdges); + Object.keys(revEdges).forEach(function (to) { + Object.keys(revEdges[to]).forEach(function (symbol) { + revEdges[to][symbol].forEach(function (from) { + if (!edges[group[from]].hasOwnProperty(group[to])) { + edges[group[from]][group[to]] = {}; + } + edges[group[from]][group[to]][symbol] = true; + }); + }); + }); + Object.keys(edges).forEach(function (from) { + Object.keys(edges[from]).forEach(function (to) { + symbol = JSON.stringify(Object.keys(edges[from][to]).sort()); + nodes[from].symbols.push(symbol); + nodes[from].edges.push([symbol, nodes[to]]); + nodes[from].trans[symbol] = nodes[to]; + }); + }); + return nodes[0]; + } + var edgesTuple = getReverseEdges(dfa), + symbols = edgesTuple[0], + idMap = edgesTuple[1], + revEdges = edgesTuple[2], + partitions = hopcroft(symbols, idMap, revEdges); + return buildMinNfa(dfa, partitions, idMap, revEdges); } function 
toNature(col) { - const base = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; - let result = 0; - if ('1' <= col[0] && col[0] <= '9') { - result = parseInt(col, 10); + var i, + j, + base = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', + result = 0; + if ('1' <= col[0] && col[0] <= '9') { + result = parseInt(col, 10); + } else { + for (i = 0, j = col.length - 1; i < col.length; i += 1, j -= 1) { + result += Math.pow(base.length, j) * (base.indexOf(col[i]) + 1); } - else { - for (let i = 0, j = col.length - 1; i < col.length; i += 1, j -= 1) { - result += Math.pow(base.length, j) * (base.indexOf(col[i]) + 1); - } - } - return result; + } + return result; } // '(\r\n|\x80)(to|from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>?\r\n'; // let regex = '(\r\n|\x80)(to|from):((a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9| |_|.|"|@|-)+<)?(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_|.|-)+@(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_|.|-)+>?\r\n'; @@ -653,73 +656,74 @@ function toNature(col) { // const base_64 = '(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|\\+|/|=)'; // const word_char = '(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_)'; function regexToDfa(regex) { - const nfa = regexToNfa(regex); - // console.log(nfa); - if (typeof nfa === 'string') { - return nfa; - } - const dfa = minDfa(nfaToDfa(nfa)); - // console.log(dfa); - const states = {}; - const nodes = []; - const stack = [dfa]; - const symbols = []; - while (stack.length > 0) { - const top = stack.pop(); - if (!states.hasOwnProperty(top.id.toString())) { - states[top.id.toString()] = top; - top.nature = toNature(top.id.toString()); - 
nodes.push(top); - for (const [symbol, node] of top.edges) { - if (symbol !== 'ϵ' && !symbols.includes(symbol)) { - symbols.push(symbol); - } - stack.push(node); - } + let nfa = regexToNfa(regex); + let dfa = minDfa(nfaToDfa(nfa)); + var i, + states = {}, + nodes = [], + stack = [dfa], + symbols = [], + top; + + while (stack.length > 0) { + top = stack.pop(); + if (!states.hasOwnProperty(top.id)) { + states[top.id] = top; + top.nature = toNature(top.id); + nodes.push(top); + for (i = 0; i < top.edges.length; i += 1) { + if (top.edges[i][0] !== 'ϵ' && symbols.indexOf(top.edges[i][0]) < 0) { + symbols.push(top.edges[i][0]); } + stack.push(top.edges[i][1]); + } } - nodes.sort((a, b) => a.nature - b.nature); - symbols.sort(); - const graph = []; - for (const node of nodes) { - const curr = {}; - curr.type = node.type; - curr.edges = {}; - for (const symbol of symbols) { - if (node.trans.hasOwnProperty(symbol)) { - curr.edges[symbol] = node.trans[symbol].nature - 1; - } - } - graph[node.nature - 1] = curr; + } + nodes.sort(function (a, b) { + return a.nature - b.nature; + }); + symbols.sort(); + let graph = []; + for (let i = 0; i < nodes.length; i += 1) { + let curr = {}; + curr.type = nodes[i].type; + curr.edges = {}; + for (let j = 0; j < symbols.length; j += 1) { + if (nodes[i].trans.hasOwnProperty(symbols[j])) { + curr.edges[symbols[j]] = nodes[i].trans[symbols[j]].nature - 1; + } } - // console.log(`graph: ${JSON.stringify(graph, null, 2)}`); - return JSON.stringify(graph); + graph[nodes[i].nature - 1] = curr; + } + // console.log(`graph: ${JSON.stringify(graph, null, 2)}`); + + return JSON.stringify(graph); } // function catchAllRegexStr() { -// return "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|\\[|\\\\|\\]|\\^|_|`|{|\\||}|~| |\t|\n|\r|\x0b|\x0c)"; +// return 
"(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|\\[|\\\\|\\]|\\^|_|`|{|\\||}|~| |\t|\n|\r|\x0b|\x0c)"; // } // function catchAllWithoutRNRegexStr() { -// return "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|[|\\\\|]|^|_|`|{|\\||}|~| |\t|\x0b|\x0c)"; +// return "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|[|\\\\|]|^|_|`|{|\\||}|~| |\t|\x0b|\x0c)"; // } // function textContextPrefix() { -// return `Content-Type: text/plain; charset="UTF-8"\r\n\r\n`; +// return `Content-Type: text/plain; charset="UTF-8"\r\n\r\n`; // } // function formatRegexPrintable(s) { -// const escaped_string_json = JSON.stringify(s); -// const escaped_string = escaped_string_json.slice(1, escaped_string_json.length - 1); -// return escaped_string -// .replaceAll("\\\\\\\\", "\\") -// .replaceAll("\\\\", "\\") -// .replaceAll("/", "\\/") -// .replaceAll("\u000b", "\\♥") -// .replaceAll("^", "\\^") -// .replaceAll("$", "\\$") -// .replaceAll("|[|", "|\\[|") -// .replaceAll("|]|", "|\\]|") -// .replaceAll("|.|", "|\\.|") -// .replaceAll("|$|", "|\\$|") -// .replaceAll("|^|", "|\\^|"); +// const escaped_string_json = JSON.stringify(s); +// const escaped_string = escaped_string_json.slice(1, escaped_string_json.length - 1); +// return escaped_string +// .replaceAll("\\\\\\\\", "\\") +// .replaceAll("\\\\", "\\") +// .replaceAll("/", "\\/") +// .replaceAll("\u000b", "\\♥") +// .replaceAll("^", "\\^") +// .replaceAll("$", "\\$") +// .replaceAll("|[|", "|\\[|") +// .replaceAll("|]|", "|\\]|") +// .replaceAll("|.|", "|\\.|") +// .replaceAll("|$|", "|\\$|") +// .replaceAll("|^|", "|\\^|"); // } // 
module.exports = { -// regexToDfa +// regexToDfa // }; diff --git a/packages/compiler/src/regex.ts b/packages/compiler/src/regex.ts index 9961813..b107d7e 100644 --- a/packages/compiler/src/regex.ts +++ b/packages/compiler/src/regex.ts @@ -1,14 +1,120 @@ /* eslint-disable no-prototype-builtins */ /*jslint browser: true*/ -// const a2z_nosep = "abcdefghijklmnopqrstuvwxyz"; -// const A2Z_nosep = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; -// const a2f_nosep = "abcdef"; -// const A2F_nosep = "ABCDEF"; -// const r0to9_nosep = "0123456789"; -// const escapeMap = { n: "\n", r: "\r", t: "\t", v: "\v", f: "\f" }; -// const whitespace = Object.values(escapeMap); -// const slash_s = whitespace.join("|"); +const a2z_nosep = "abcdefghijklmnopqrstuvwxyz"; +const A2Z_nosep = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +const a2f_nosep = "abcdef"; +const A2F_nosep = "ABCDEF"; +const r0to9_nosep = "0123456789"; +const escapeMap = { n: "\n", r: "\r", t: "\t", v: "\v", f: "\f" }; +const whitespace = Object.values(escapeMap); +const slash_s = whitespace.join("|"); + +/** + * Parse regex to a min DFA spec + * to support some shorthands that make regex easier to write e.g. 
[A-Z] + */ +function regexToMinDFASpec(str: string): string { + // Replace all A-Z with A2Z etc + let combined_nosep = str + .replaceAll("A-Z", A2Z_nosep) + .replaceAll("a-z", a2z_nosep) + .replaceAll("A-F", A2F_nosep) + .replaceAll("a-f", a2f_nosep) + .replaceAll("0-9", r0to9_nosep) + .replaceAll("\\w", A2Z_nosep + r0to9_nosep + a2z_nosep + "_") + .replaceAll("\\d", r0to9_nosep) + .replaceAll("\\s", slash_s); + // .replaceAll("\\w", A2Z_nosep + r0to9_nosep + a2z_nosep); // I think that there's also an underscore here + + function addPipeInsideBrackets(str: string):string { + let result: string = ""; + let insideBrackets: boolean = false; + for (let i = 0; i < str.length; i++) { + if (str[i] === "[") { + result += str[i]; + insideBrackets = true; + continue; + } else if (str[i] === "]") { + insideBrackets = false; + } + let str_to_add = str[i]; + if (str[i] === "\\") { + i++; + str_to_add += str[i]; + } + result += insideBrackets ? "|" + str_to_add : str_to_add; + } + return result.replaceAll("[|", "[").replaceAll("[", "(").replaceAll("]", ")"); + } + + // function makeCurlyBracesFallback(str) { + // let result = ""; + // let insideBrackets = false; + // for (let i = 0; i < str.length; i++) { + // if (str[i] === "{") { + // result += str[i]; + // insideBrackets = true; + // continue; + // } else if (str[i] === "}") { + // insideBrackets = false; + // } + // result += insideBrackets ? 
"|" + str[i] : str[i]; + // } + // return result.replaceAll("[|", "[").replaceAll("[", "(").replaceAll("]", ")"); + // } + + function checkIfBracketsHavePipes(str: string): boolean { + let result: boolean = true; + let insideBrackets: boolean = false; + let insideParens: number = 0; + let indexAt: number = 0; + for (let i = 0; i < str.length; i++) { + if (indexAt >= str.length) break; + if (str[indexAt] === "[") { + insideBrackets = true; + indexAt++; + continue; + } else if (str[indexAt] === "]") { + insideBrackets = false; + } + if (str[indexAt] === "(") { + insideParens++; + } else if (str[indexAt] === ")") { + insideParens--; + } + if (insideBrackets) { + if (str[indexAt] === "|") { + indexAt++; + } else { + result = false; + return result; + } + } + if (!insideParens && str[indexAt] === "|") { + console.log("Error: | outside of parens!"); + } + if (str[indexAt] === "\\") { + indexAt++; + } + indexAt++; + } + return result; + } + + let combined; + if (!checkIfBracketsHavePipes(combined_nosep)) { + // console.log("Adding pipes within brackets between everything!"); + combined = addPipeInsideBrackets(combined_nosep); + if (!checkIfBracketsHavePipes(combined)) { + console.log("Did not add brackets correctly!"); + } + } else { + combined = combined_nosep; + } + + return combined; +} type CusNode = { type?: string; @@ -40,13 +146,14 @@ type DfaNode = { nature: number; }; + /** * Try parsing simple regular expression to syntax tree. * * Basic grammars: * Empty: S -> ϵ * Cat: S -> S S - * Or: S -> S | S + * Or: S -> S | S * Star: S -> S * * Text: S -> [0-9a-zA-Z] * S -> ( S ) @@ -57,250 +164,151 @@ type DfaNode = { * * @param {string} text The input regular expression * @return {string|object} Returns a string that is an error message if failed to parse the expression, - * otherwise returns an object which is the syntax tree. + * otherwise returns an object which is the syntax tree. 
*/ function parseRegex(text: string): CusNode | string { - 'use strict'; - function parseSub(text: (string | [string])[], begin: number, end: number, first: boolean): CusNode | string { - var i: number, - sub: CusNode | string, - last: number = 0, - node: CusNode = { - begin: begin, - end: end, - }, - virNode: CusNode, - tempNode: CusNode, - stack: number = 0, - parts: CusNode[] = []; - if (text.length === 0) { - return 'Error: empty input at ' + begin + '.'; - } - if (first) { - for (i = 0; i <= text.length; i += 1) { - if (i === text.length || (text[i] === '|' && stack === 0)) { - if (last === 0 && i === text.length) { - return parseSub(text, begin + last, begin + i, false); - } - sub = parseSub(text.slice(last, i), begin + last, begin + i, true); - if (typeof sub === 'string') { - return sub; - } - parts.push(sub); - last = i + 1; - } else if (text[i] === '(') { - stack += 1; - } else if (text[i] === ')') { - stack -= 1; - } - } - - if (parts.length === 1) { - return parts[0]; - } - node.type = 'or'; - node.parts = parts; - } else { - for (i = 0; i < text.length; i += 1) { - if (text[i] === '(') { - last = i + 1; - i += 1; - stack = 1; - while (i < text.length && stack !== 0) { - if (text[i] === '(') { - stack += 1; - } else if (text[i] === ')') { - stack -= 1; - } - i += 1; - } - if (stack !== 0) { - return `Error: missing right parentheses for ${begin + last}.`; - } - i -= 1; - sub = parseSub(text.slice(last, i), begin + last, begin + i, true); - if (typeof sub === 'string') { - return sub; - } - sub.begin -= 1; - sub.end += 1; - parts.push(sub); - // } else if (text[i] === '[') { - // last = i + 1; - // i += 1; - // if (text[i] === '^') { - // text[i] = '\u{ff}'; - // } - // stack = 1; - // while (i < text.length && stack !== 0) { - // if (text[i] === ']') { - // stack -= 1; - // } - // i += 1; - // } - // if (stack !== 0) { - // return 'Error: missing right brakets for ' + (begin + last) + '.'; - // } - // i -= 1; - // sub = parseSub(text.slice(last, i), 
begin + last, begin + i, true); - // if (typeof sub === 'string') { - // return sub; - // } - // sub.begin -= 1; - // sub.end += 1; - // parts.push(sub); - } else if (text[i] === '*') { - if (parts.length === 0) { - return `Error: unexpected * at ${begin + i}.`; - } - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = 'star'; - tempNode.sub = parts[parts.length - 1]; - parts[parts.length - 1] = tempNode; - } else if (text[i] === '+') { - if (parts.length === 0) { - return `Error: unexpected + at ${begin + i}.`; - } - virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - virNode.type = 'star'; - virNode.sub = parts[parts.length - 1]; - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = 'cat'; - tempNode.parts = [parts[parts.length - 1], virNode]; - parts[parts.length - 1] = tempNode; - } else if (text[i] === '?') { - if (parts.length === 0) { - return `Error: unexpected ? 
at ${begin + i}.`; - } - virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - virNode.type = 'empty'; - virNode.sub = parts[parts.length - 1]; - tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; - tempNode.type = 'or'; - tempNode.parts = [parts[parts.length - 1], virNode]; - parts[parts.length - 1] = tempNode; - } else if (text[i] === 'ϵ') { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = 'empty'; - parts.push(tempNode); - } else if (Array.isArray(text[i])) { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = 'text'; - tempNode.text = text[i][0]; - parts.push(tempNode); - } else { - tempNode = { begin: begin + i, end: begin + i + 1 }; - tempNode.type = 'text'; - tempNode.text = text[i]; - parts.push(tempNode); - } - } - // console.log(`parts ${JSON.stringify(parts)}`); - if (parts.length === 1) { - return parts[0]; - } - node.type = 'cat'; - node.parts = parts; - } - return node; + text = regexToMinDFASpec(text); + "use strict"; + function parseSub(text: (string | [string])[], begin: number, end: number, first: boolean): CusNode | string { + var i: number, + sub: CusNode | string, + last: number = 0, + node: CusNode = { + begin: begin, + end: end, + }, + virNode: CusNode, + tempNode: CusNode, + stack: number = 0, + parts: CusNode[] = []; + if (text.length === 0) { + return "Error: empty input at " + begin + "."; } - - let char: string | [string]; - let new_text: (string | [string])[] = []; - let i: number = 0; - let is_in_brancket: boolean = false; - let brancket_text: (string | [string])[] = []; - while (i < text.length) { - char = text[i]; - - if (text[i] == '\\') { - char = [text[i + 1]]; - // new_text.push([text[i + 1]]); - i += 1; - } - - if (char === '[') { - if (is_in_brancket) { - return `Error: unexpected [ at ${i}.`; - } - is_in_brancket = true; - brancket_text = []; - // new_text.push(char); - i += 1; - } else if (char === ']') { 
- if (!is_in_brancket) { - return `Error: unexpected ] at ${i}.`; - } - is_in_brancket = false; - - if (brancket_text[0] === '^') { - brancket_text.shift(); - let rev_text: (string | [string])[] = []; - let code_char: string | [string] = ''; - const brancket_text_jsons = brancket_text.map(val => JSON.stringify(val)); - for (let idx = 0; idx < 255; idx++) { - code_char = String.fromCodePoint(idx); - - if ([ - '(', - ')', - '*', - '+', - '.', - '?', - '[', - '\\', - ']', - '^', - '`', - '|', - '-' - ].indexOf(code_char) != -1) { - code_char = [code_char]; - } - - if (brancket_text_jsons.indexOf(JSON.stringify(code_char)) === -1) { - rev_text.push(code_char); - } - } - - brancket_text = rev_text; - } - - new_text.push('('); - - for (const c of brancket_text) { - new_text.push(c); - new_text.push('|'); - } - - new_text = new_text.slice(0, -1); - new_text.push(')'); - i += 1; - } else if (is_in_brancket) { - if (!Array.isArray(char) && ['(', ')', '[', '*', '+', '?', 'ϵ'].includes(char)) { - return `Error: unexpected ${char} at ${i}.`; - } - - if (char === '^' && text[i - 1] !== '[') { - return `Error: unexpected ^ at ${i}.`; - } - // new_text.push(char); - // new_text.push('|'); - brancket_text.push(char); - i += 1; - } else { - new_text.push(char); - i += 1; + if (first) { + for (i = 0; i <= text.length; i += 1) { + if (i === text.length || (text[i] === "|" && stack === 0)) { + if (last === 0 && i === text.length) { + return parseSub(text, begin + last, begin + i, false); + } + sub = parseSub(text.slice(last, i), begin + last, begin + i, true); + if (typeof sub === "string") { + return sub; + } + parts.push(sub); + last = i + 1; + } else if (text[i] === "(") { + stack += 1; + } else if (text[i] === ")") { + stack -= 1; + } + } + if (parts.length === 1) { + return parts[0]; + } + node.type = "or"; + node.parts = parts; + } else { + for (i = 0; i < text.length; i += 1) { + if (text[i] === "(") { + last = i + 1; + i += 1; + stack = 1; + while (i < text.length && stack !== 
0) { + if (text[i] === "(") { + stack += 1; + } else if (text[i] === ")") { + stack -= 1; } + i += 1; + } + if (stack !== 0) { + return "Error: missing right bracket for " + (begin + last) + "."; + } + i -= 1; + sub = parseSub(text.slice(last, i), begin + last, begin + i, true); + if (typeof sub === "string") { + return sub; + } + sub.begin -= 1; + sub.end += 1; + parts.push(sub); + } else if (text[i] === "*") { + if (parts.length === 0) { + return "Error: unexpected * at " + (begin + i) + "."; + } + tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + tempNode.type = "star"; + tempNode.sub = parts[parts.length - 1]; + parts[parts.length - 1] = tempNode; + } else if (text[i] === "+") { + if (parts.length === 0) { + return "Error: unexpected + at " + (begin + i) + "."; + } + virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + virNode.type = "star"; + virNode.sub = parts[parts.length - 1]; + tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + tempNode.type = "cat"; + tempNode.parts = [parts[parts.length - 1], virNode]; + parts[parts.length - 1] = tempNode; + } else if (text[i] === "?") { + if (parts.length === 0) { + return "Error: unexpected + at " + (begin + i) + "."; + } + virNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + virNode.type = "empty"; + virNode.sub = parts[parts.length - 1]; + tempNode = { begin: parts[parts.length - 1].begin, end: parts[parts.length - 1].end + 1 }; + tempNode.type = "or"; + tempNode.parts = [parts[parts.length - 1], virNode]; + parts[parts.length - 1] = tempNode; + } else if (text[i] === "ϵ") { + tempNode = { begin: begin + i, end: begin + i + 1 }; + tempNode.type = "empty"; + parts.push(tempNode); + } else if (Array.isArray(text[i])) { + tempNode = { begin: begin + i, end: begin + i + 1 }; + tempNode.type = "text"; + tempNode.text = text[i][0]; + parts.push(tempNode); + } 
else { + tempNode = { begin: begin + i, end: begin + i + 1 }; + tempNode.type = "text"; + tempNode.text = text[i]; + parts.push(tempNode); + } } - - if (is_in_brancket) { - return `Error: missing right brackets.`; + if (parts.length === 1) { + return parts[0]; } - - return parseSub(new_text, 0, new_text.length, true); -} + node.type = "cat"; + node.parts = parts; + } + return node; + } + + let new_text: string[] = []; + let i = 0; + while (i < text.length) { + if (text[i] === "\\") { + const escapeMap = new Map([ + ["n", "\n"], + ["r", "\r"], + ["t", "\t"], + ["v", "\v"], + ["f", "\f"], + ["^", String.fromCharCode(128)], + ]); + const char: string = text[i + 1]; + new_text.push(escapeMap.get(char) ?? char); + i += 2; + } else { + new_text.push(text[i]); + i += 1; + } + } + return parseSub(new_text, 0, new_text.length, true); + } /** @@ -310,74 +318,59 @@ function parseRegex(text: string): CusNode | string { * @return {object|string} */ function regexToNfa(text: string): NfaNode | string { - 'use strict'; - function generateGraph(node: CusNode, start: NfaNode, end: NfaNode, count: number): number { - if ('id' in start) { - start.id = count; - count += 1; - } - - switch (node.type) { - case 'empty': - start.edges.push(['ϵ', end]); - break; - case 'text': - start.edges.push([node.text!, end]); - break; - case 'cat': - let last = start; - for (let i = 0; i < node.parts!.length - 1; i += 1) { - const temp: NfaNode = { type: '', edges: [] }; - count = generateGraph(node.parts![i], last, temp, count); - last = temp; - } - count = generateGraph( - node.parts![node.parts!.length - 1], - last, - end, - count - ); - break; - case 'or': - for (let i = 0; i < node.parts!.length; i += 1) { - const tempStart: NfaNode = { type: '', edges: [] }; - const tempEnd: NfaNode = { - type: '', - edges: [['ϵ', end]], - }; - start.edges.push(['ϵ', tempStart]); - count = generateGraph(node.parts![i], tempStart, tempEnd, count); - } - break; - case 'star': - const tempStart: NfaNode = { type: 
'', edges: [] }; - const tempEnd: NfaNode = { - type: '', - edges: [['ϵ', tempStart], ['ϵ', end]], - }; - start.edges.push(['ϵ', tempStart]); - start.edges.push(['ϵ', end]); - count = generateGraph(node.sub!, tempStart, tempEnd, count); - break; + 'use strict'; + function generateGraph(node: CusNode, start: NfaNode, end: NfaNode, count: number): number { + var i: number, last: NfaNode, temp: NfaNode, tempStart: NfaNode, tempEnd: NfaNode; + if (!start.hasOwnProperty('id')) { + start.id = count; + count += 1; + } + switch (node.type) { + case 'empty': + start.edges.push(['ϵ', end]); + break; + case 'text': + start.edges.push([node.text!, end]); + break; + case 'cat': + last = start; + for (i = 0; i < node.parts!.length - 1; i += 1) { + temp = { 'type': '', 'edges': [] }; + count = generateGraph(node.parts![i], last, temp, count); + last = temp; } - - if (!('id' in end)) { - end.id = count; - count += 1; + count = generateGraph(node.parts![node.parts!.length - 1], last, end, count); + break; + case 'or': + for (i = 0; i < node.parts!.length; i += 1) { + tempStart = { 'type': '', 'edges': [] }; + tempEnd = { 'type': '', 'edges': [['ϵ', end]] }; + start.edges.push(['ϵ', tempStart]); + count = generateGraph(node.parts![i], tempStart, tempEnd, count); } - - return count; + break; + case 'star': + tempStart = { 'type': '', 'edges': [] }; + tempEnd = { 'type': '', 'edges': [['ϵ', tempStart], ['ϵ', end]] }; + start.edges.push(['ϵ', tempStart]); + start.edges.push(['ϵ', end]); + count = generateGraph(node.sub!, tempStart, tempEnd, count); + break; } - const ast: string | CusNode = parseRegex(text); - const start: NfaNode = { type: '', edges: [] }; - const accept: NfaNode = { type: 'accept', edges: [] }; - - if (typeof ast === 'string') { - return ast; + if (!end.hasOwnProperty('id')) { + end.id = count; + count += 1; } - - generateGraph(ast, start, accept, 0); - return start; + return count; + } + var ast = parseRegex(text), + start = { 'type': 'start', 'edges': [] }, + 
accept = { 'type': 'accept', 'edges': [] }; + if (typeof ast === 'string') { + return ast; + } + generateGraph(ast, start, accept, 0); + return start; } /** @@ -387,124 +380,117 @@ function regexToNfa(text: string): NfaNode | string { * @return {object} dfa Returns the first element of the DFA. */ function nfaToDfa(nfa: NfaNode): DfaNode { - 'use strict'; - function getClosure(nodes: NfaNode[]): DfaNode { - const closure: NfaNode[] = []; - const stack: NfaNode[] = []; - const symbols: (string | [string])[] = []; - let type = ''; - let top: NfaNode | string; - - for (const node of nodes) { - stack.push(node); - closure.push(node); - if (node.type === 'accept') { - type = 'accept'; - } - } - - while (stack.length > 0) { - top = stack.pop()!; - if (typeof top === 'string' && (top as string).startsWith('Error')) { - continue; - } - for (const [edgeSymbol, edgeNode] of top.edges) { - if (edgeSymbol === 'ϵ') { - if (!closure.includes(edgeNode)) { - stack.push(edgeNode); - closure.push(edgeNode); - if (edgeNode.type === 'accept') { - type = 'accept'; - } - } - } else { - if (!symbols.includes(edgeSymbol)) { - symbols.push(edgeSymbol); - } - } + 'use strict'; + function getClosure(nodes: NfaNode[]): DfaNode { + var i: number, + closure: NfaNode[] = [], + stack: NfaNode[] = [], + symbols: (string | [string])[] = [], + type: string = '', + top: NfaNode; + for (i = 0; i < nodes.length; i += 1) { + stack.push(nodes[i]); + closure.push(nodes[i]); + if (nodes[i].type === 'accept') { + type = 'accept'; + } + } + while (stack.length > 0) { + top = stack.pop()!; + // If top is of type string and starts with "Error" then return error + if (typeof top === 'string' && top[0] === 'E' && !top) { + continue; + } + for (i = 0; i < top.edges.length; i += 1) { + if (top.edges[i][0] === 'ϵ') { + if (closure.indexOf(top.edges[i][1]) < 0) { + stack.push(top.edges[i][1]); + closure.push(top.edges[i][1]); + if (top.edges[i][1].type === 'accept') { + type = 'accept'; } + } + } else { + if 
(symbols.indexOf(top.edges[i][0]) < 0) { + symbols.push(top.edges[i][0]); + } } - - closure.sort((a, b) => { + } + } + closure.sort((a, b) => { if (a.id && b.id) { return a.id > b.id ? 1 : -1; } return 0; }); - - symbols.sort(); - - return { - id: '', - key: closure.map((x) => x.id).join(','), - items: closure, - symbols: symbols, - type: type, - edges: [], - trans: {}, - nature: 0, - }; - } - - function getClosedMove(closure: DfaNode, symbol: string | [string]): DfaNode { - const nexts: NfaNode[] = []; - - for (const node of closure.items) { - for (const [edgeSymbol, edgeNode] of node.edges) { - if (edgeSymbol === symbol && !nexts.includes(edgeNode)) { - nexts.push(edgeNode); - } - } + symbols.sort(); + return { + id: '', + 'key': closure.map(function (x) { + return x.id; + }).join(','), + 'items': closure, + 'symbols': symbols, + 'type': type, + 'edges': [], + 'trans': {}, + 'nature': 0, + }; + } + function getClosedMove(closure: DfaNode, symbol: string | [string]): DfaNode { + var i, + j, + node, + nexts = []; + for (i = 0; i < closure.items.length; i += 1) { + node = closure.items[i]; + for (j = 0; j < node.edges.length; j += 1) { + if (symbol === node.edges[j][0]) { + if (nexts.indexOf(node.edges[j][1]) < 0) { + nexts.push(node.edges[j][1]); + } } - - return getClosure(nexts); + } } - - function toAlphaCount(n: number): string { - const a = 'A'.charCodeAt(0); - const z = 'Z'.charCodeAt(0); - const len = z - a + 1; - let s = ''; - - while (n >= 0) { - s = String.fromCharCode(n % len + a) + s; - n = Math.floor(n / len) - 1; - } - - return s; + return getClosure(nexts); + } + function toAlphaCount(n: number): string { + var a = 'A'.charCodeAt(0), + z = 'Z'.charCodeAt(0), + len = z - a + 1, + s = ''; + while (n >= 0) { + s = String.fromCharCode(n % len + a) + s; + n = Math.floor(n / len) - 1; } - - let i: number; - const first: DfaNode = getClosure([nfa]); - const states: Record = {}; - let front = 0; - let top: DfaNode; - let closure: DfaNode; - const queue: 
DfaNode[] = [first]; - let count = 0; - first.id = toAlphaCount(count); - states[first.key] = first; - - while (front < queue.length) { - top = queue[front]; - front += 1; - - for (i = 0; i < top.symbols.length; i += 1) { - closure = getClosedMove(top, top.symbols[i]); - - if (!(closure.key in states)) { - count += 1; - closure.id = toAlphaCount(count); - states[closure.key] = closure; - queue.push(closure); - } - - top.trans[top.symbols[i] as string] = states[closure.key]; - top.edges.push([top.symbols[i], states[closure.key]]); - } + return s; + } + var i: number, + first: DfaNode = getClosure([nfa]), + states: Record = {}, + front: number = 0, + top: DfaNode, + closure: DfaNode, + queue: DfaNode[] = [first], + count: number = 0; + first.id = toAlphaCount(count); + states[first.key] = first; + while (front < queue.length) { + top = queue[front]; + front += 1; + for (i = 0; i < top.symbols.length; i += 1) { + closure = getClosedMove(top, top.symbols[i]); + if (!states.hasOwnProperty(closure.key)) { + count += 1; + closure.id = toAlphaCount(count); + states[closure.key] = closure; + queue.push(closure); + } + top.trans[top.symbols[i] as string] = states[closure.key]; + top.edges.push([top.symbols[i], states[closure.key]]); } - - return first; + } + return first; } /** @@ -514,245 +500,216 @@ function nfaToDfa(nfa: NfaNode): DfaNode { * @return {object} dfa Returns the first element of the minimum DFA. 
*/ function minDfa(dfa: DfaNode) { - 'use strict'; - function getReverseEdges(start: DfaNode): [string[], Record, Record>] { - const symbols: Record = {}; // The input alphabet - const idMap: Record = {}; // Map id to states - const revEdges: Record> = {} // Map id to the ids which connects to the id with an alphabet; - const visited: Record = {}; - visited[start.id] = true; - - const queue: DfaNode[] = [start]; - let front = 0; - let top: DfaNode; - let symbol: string | [string]; - let next: DfaNode; - - while (front < queue.length) { - top = queue[front]; - front += 1; - idMap[top.id] = top; - - for (symbol of top.symbols) { - const symbolString = symbol as string; - if (!(symbolString in symbols)) { - symbols[symbolString] = true; - } - - next = top.trans[symbolString]; - - if (!(next.id in revEdges)) { - revEdges[next.id] = {}; - } - - if (!(symbolString in revEdges[next.id])) { - revEdges[next.id][symbolString] = []; - } - - revEdges[next.id][symbolString].push(top.id); - - if (!(next.id in visited)) { - visited[next.id] = true; - queue.push(next); - } - } + 'use strict'; + function getReverseEdges(start: DfaNode): [string[], Record, Record>] { + var i: number, top: DfaNode, symbol: string | [string], next: DfaNode, + front: number = 0, + queue: DfaNode[] = [start], + visited: Record = {}, + symbols: Record = {}, // The input alphabet + idMap: Record = {}, // Map id to states + revEdges: Record> = {}; // Map id to the ids which connects to the id with an alphabet + visited[start.id] = true; + while (front < queue.length) { + top = queue[front]; + front += 1; + idMap[top.id] = top; + for (i = 0; i < top.symbols.length; i += 1) { + symbol = top.symbols[i]; + if (!symbols.hasOwnProperty(symbol as string)) { + symbols[symbol as string] = true; } - - return [Object.keys(symbols), idMap, revEdges]; - } - - function hopcroft(symbols: string[], idMap: Record, revEdges: Record>): string[][] { - const ids = Object.keys(idMap).sort(); - const partitions: Record = {}; - 
const queue: (string | null)[] = []; - const visited: Record = {}; - - let front = 0; - let top: string[] | string | null; - let i: number; - let j: number; - let k: number; - let keys: string[]; - let key: string; - let key1: string; - let key2: string; - let group1: string[]; - let group2: string[]; - let symbol: string; - let revGroup: Record; - - group1 = []; - group2 = []; - - for (i = 0; i < ids.length; i += 1) { - if (idMap[ids[i]].type === 'accept') { - group1.push(ids[i]); - } else { - group2.push(ids[i]); - } + next = top.trans[symbol as string]; + if (!revEdges.hasOwnProperty(next.id)) { + revEdges[next.id] = {}; } - - key = group1.join(','); - partitions[key] = group1; - queue.push(key); - visited[key] = 0; - - if (group2.length !== 0) { - key = group2.join(','); - partitions[key] = group2; - queue.push(key); + if (!revEdges[next.id].hasOwnProperty(symbol as string)) { + revEdges[next.id][symbol as string] = []; } - - while (front < queue.length) { - top = queue[front]; - front += 1; - - if (top !== null) { - top = top.split(','); - - for (symbol of symbols) { - revGroup = {}; - - for (j = 0; j < top.length; j += 1) { - if (revEdges[top[j]] && revEdges[top[j]][symbol]) { - for (k = 0; k < revEdges[top[j]][symbol].length; k += 1) { - revGroup[revEdges[top[j]][symbol][k]] = true; - } - } - } - - keys = Object.keys(partitions); - - for (key of keys) { - group1 = []; - group2 = []; - - for (k = 0; k < partitions[key].length; k += 1) { - if (revGroup[partitions[key][k]]) { - group1.push(partitions[key][k]); - } else { - group2.push(partitions[key][k]); - } - } - - if (group1.length !== 0 && group2.length !== 0) { - delete partitions[key]; - key1 = group1.join(','); - key2 = group2.join(','); - partitions[key1] = group1; - partitions[key2] = group2; - - if (visited[key1]) { - queue[visited[key1]] = null; - visited[key1] = queue.length; - queue.push(key1); - visited[key2] = queue.length; - queue.push(key2); - } else if (group1.length <= group2.length) { - 
visited[key1] = queue.length; - queue.push(key1); - } else { - visited[key2] = queue.length; - queue.push(key2); - } - } - } - } - } + revEdges[next.id][symbol as string].push(top.id); + if (!visited.hasOwnProperty(next.id)) { + visited[next.id] = true; + queue.push(next); } + } + } + return [Object.keys(symbols), idMap, revEdges]; + } + function hopcroft(symbols: string[], idMap: Record, revEdges: Record>): string[][] { + const ids = Object.keys(idMap).sort(); + const partitions: Record = {}; + const queue: (string | null)[] = []; + const visited: Record = {}; - return Object.values(partitions); + let front = 0; + let top: string[] | string | null; + let i: number; + let j: number; + let k: number; + let keys: string[]; + let key: string; + let key1: string; + let key2: string; + let group1: string[]; + let group2: string[]; + let symbol: string; + let revGroup: Record; + + group1 = []; + group2 = []; + for (i = 0; i < ids.length; i += 1) { + if (idMap[ids[i]].type === 'accept') { + group1.push(ids[i]); + } else { + group2.push(ids[i]); + } + } + key = group1.join(','); + partitions[key] = group1; + queue.push(key); + visited[key] = 0; + if (group2.length !== 0) { + key = group2.join(','); + partitions[key] = group2; + queue.push(key); } - function buildMinNfa(start: DfaNode, partitions: string[][], idMap: Record, revEdges: Record>): DfaNode { - const nodes: DfaNode[] = []; - const group: Record = {}; - const edges: Record>> = {}; - - partitions.sort((a, b) => { - const ka = a.join(','); - const kb = b.join(','); - if (ka < kb) { - return -1; + while (front < queue.length) { + top = queue[front]; + front += 1; + if (top) { + top = top.split(','); + for (i = 0; i < symbols.length; i += 1) { + symbol = symbols[i]; + revGroup = {}; + for (j = 0; j < top.length; j += 1) { + if (revEdges.hasOwnProperty(top[j]) && revEdges[top[j]].hasOwnProperty(symbol)) { + for (k = 0; k < revEdges[top[j]][symbol].length; k += 1) { + revGroup[revEdges[top[j]][symbol][k]] = true; + } } 
- if (ka > kb) { - return 1; + } + keys = Object.keys(partitions); + for (j = 0; j < keys.length; j += 1) { + key = keys[j]; + group1 = []; + group2 = []; + for (k = 0; k < partitions[key].length; k += 1) { + if (revGroup.hasOwnProperty(partitions[key][k])) { + group1.push(partitions[key][k]); + } else { + group2.push(partitions[key][k]); + } } - return 0; - }); - - for (let i = 0; i < partitions.length; i += 1) { - if (partitions[i].indexOf(start.id.toString()) >= 0) { - if (i > 0) { - const temp = partitions[i]; - partitions[i] = partitions[0]; - partitions[0] = temp; - } - break; + if (group1.length !== 0 && group2.length !== 0) { + delete partitions[key]; + key1 = group1.join(','); + key2 = group2.join(','); + partitions[key1] = group1; + partitions[key2] = group2; + if (visited.hasOwnProperty(key1)) { + queue[visited[key1]] = null; + visited[key1] = queue.length; + queue.push(key1); + visited[key2] = queue.length; + queue.push(key2); + } else if (group1.length <= group2.length) { + visited[key1] = queue.length; + queue.push(key1); + } else { + visited[key2] = queue.length; + queue.push(key2); + } } + } } - - for (let i = 0; i < partitions.length; i += 1) { - const node: DfaNode = { - id: (i + 1).toString(), - key: partitions[i].join(','), - items: [], - symbols: [], - type: idMap[partitions[i][0]].type, - edges: [], - trans: {}, - nature: 0, - }; - - for (let j = 0; j < partitions[i].length; j += 1) { - node.items.push(idMap[partitions[i][j]]); - group[partitions[i][j]] = i; - } - - edges[i] = {}; - nodes.push(node); + } + } + return Object.values(partitions); + } + function buildMinNfa(start: DfaNode, partitions: string[][], idMap: Record, revEdges: Record>): DfaNode { + var i: number, j: number, temp: string[], node, symbol; + const nodes: DfaNode[] = []; + const group: Record = {}; + const edges: Record>> = {}; + partitions.sort(function (a, b) { + var ka = a.join(','), kb = b.join(','); + if (ka < kb) { + return -1; + } + if (ka > kb) { + return 1; + } + 
return 0; + }); + for (i = 0; i < partitions.length; i += 1) { + if (partitions[i].indexOf(start.id.toString()) >= 0) { + if (i > 0) { + temp = partitions[i]; + partitions[i] = partitions[0]; + partitions[0] = temp; } - - Object.keys(revEdges).forEach((to) => { - Object.keys(revEdges[to]).forEach((symbol) => { - revEdges[to][symbol].forEach((from) => { - if (!edges[group[from]].hasOwnProperty(group[to])) { - edges[group[from]][group[to]] = {}; - } - edges[group[from]][group[to]][symbol] = true; - }); - }); + break; + } + } + for (i = 0; i < partitions.length; i += 1) { + const node: DfaNode = { + id: (i + 1).toString(), + key: partitions[i].join(','), + items: [], + symbols: [], + type: idMap[partitions[i][0]].type, + edges: [], + trans: {}, + nature: 0, + }; + for (j = 0; j < partitions[i].length; j += 1) { + node.items.push(idMap[partitions[i][j]]); + group[partitions[i][j]] = i; + } + edges[i] = {}; + nodes.push(node); + } + Object.keys(revEdges).forEach(function (to) { + Object.keys(revEdges[to]).forEach(function (symbol) { + revEdges[to][symbol].forEach(function (from) { + if (!edges[group[from]].hasOwnProperty(group[to])) { + edges[group[from]][group[to]] = {}; + } + edges[group[from]][group[to]][symbol] = true; }); - - Object.keys(edges).forEach((from) => { - Object.keys(edges[Number(from)]).forEach((to) => { - const symbol = JSON.stringify(Object.keys(edges[Number(from)][Number(to)]).sort()); - nodes[parseInt(from)].symbols.push(symbol); - nodes[parseInt(from)].edges.push([symbol, nodes[parseInt(to)]]); - nodes[parseInt(from)].trans[symbol] = nodes[parseInt(to)]; - }); + }); + }); + Object.keys(edges).forEach((from) => { + Object.keys(edges[Number(from)]).forEach((to) => { + const symbol = JSON.stringify(Object.keys(edges[Number(from)][Number(to)]).sort()); + nodes[parseInt(from)].symbols.push(symbol); + nodes[parseInt(from)].edges.push([symbol, nodes[parseInt(to)]]); + nodes[parseInt(from)].trans[symbol] = nodes[parseInt(to)]; }); - - return nodes[0]; - } 
- - const [symbols, idMap, revEdges] = getReverseEdges(dfa); - const partitions = hopcroft(symbols, idMap, revEdges); - return buildMinNfa(dfa, partitions, idMap, revEdges); + }); + return nodes[0]; + } + var edgesTuple = getReverseEdges(dfa), + symbols = edgesTuple[0], + idMap = edgesTuple[1], + revEdges = edgesTuple[2], + partitions = hopcroft(symbols, idMap, revEdges); + return buildMinNfa(dfa, partitions, idMap, revEdges); } function toNature(col: string): number { - const base = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; - let result = 0; - - if ('1' <= col[0] && col[0] <= '9') { - result = parseInt(col, 10); - } else { - for (let i = 0, j = col.length - 1; i < col.length; i += 1, j -= 1) { - result += Math.pow(base.length, j) * (base.indexOf(col[i]) + 1); - } + var i, + j, + base = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', + result = 0; + if ('1' <= col[0] && col[0] <= '9') { + result = parseInt(col, 10); + } else { + for (i = 0, j = col.length - 1; i < col.length; i += 1, j -= 1) { + result += Math.pow(base.length, j) * (base.indexOf(col[i]) + 1); } - - return result; + } + return result; } // '(\r\n|\x80)(to|from):([A-Za-z0-9 _."@-]+<)?[a-zA-Z0-9_.-]+@[a-zA-Z0-9_.]+>?\r\n'; @@ -766,84 +723,78 @@ function toNature(col: string): number { function regexToDfa(regex: string): string { - const nfa = regexToNfa(regex); - // console.log(nfa); - if (typeof nfa === 'string') { - return nfa; - } - - const dfa = minDfa(nfaToDfa(nfa)); - // console.log(dfa); - const states: Record = {}; - const nodes: DfaNode[] = []; - const stack: DfaNode[] = [dfa]; - const symbols: string[] = []; - - while (stack.length > 0) { - const top = stack.pop()!; - if (!states.hasOwnProperty(top.id.toString())) { - states[top.id.toString()] = top; - top.nature = toNature(top.id.toString()); - nodes.push(top); - for (const [symbol, node] of top.edges) { - if (symbol !== 'ϵ' && !symbols.includes(symbol as string)) { - symbols.push(symbol as string); - } - stack.push(node); - } + let nfa = regexToNfa(regex); + let dfa = 
minDfa(nfaToDfa(nfa as NfaNode)); + var i: number, + states: Record = {}, + nodes: DfaNode[] = [], + stack = [dfa], + symbols: string[] = []; + + while (stack.length > 0) { + const top = stack.pop()!; + if (!states.hasOwnProperty(top.id.toString())) { + states[top.id] = top; + top.nature = toNature(top.id.toString()); + nodes.push(top); + for (i = 0; i < top.edges.length; i += 1) { + if (top.edges[i][0] !== 'ϵ' && symbols.indexOf(top.edges[i][0] as string) < 0) { + symbols.push(top.edges[i][0] as string); } + stack.push(top.edges[i][1]); + } } - - nodes.sort((a, b) => a.nature - b.nature); - symbols.sort(); - - const graph: Record[] = []; - - for (const node of nodes) { - const curr: Record = {}; - curr.type = node.type; - curr.edges = {}; - for (const symbol of symbols) { - if (node.trans.hasOwnProperty(symbol)) { - curr.edges[symbol] = node.trans[symbol].nature - 1; - } - } - graph[node.nature - 1] = curr; + } + nodes.sort(function (a, b) { + return a.nature - b.nature; + }); + symbols.sort(); + const graph: Record[] = []; + for (let i = 0; i < nodes.length; i += 1) { + const curr: Record = {}; + curr.type = nodes[i].type; + curr.edges = {}; + for (let j = 0; j < symbols.length; j += 1) { + if (nodes[i].trans.hasOwnProperty(symbols[j])) { + curr.edges[symbols[j]] = nodes[i].trans[symbols[j]].nature - 1; + } } - // console.log(`graph: ${JSON.stringify(graph, null, 2)}`); + graph[nodes[i].nature - 1] = curr; + } + // console.log(`graph: ${JSON.stringify(graph, null, 2)}`); - return JSON.stringify(graph); + return JSON.stringify(graph); } // function catchAllRegexStr() { -// return "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|\\[|\\\\|\\]|\\^|_|`|{|\\||}|~| |\t|\n|\r|\x0b|\x0c)"; +// return 
"(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|\\[|\\\\|\\]|\\^|_|`|{|\\||}|~| |\t|\n|\r|\x0b|\x0c)"; // } // function catchAllWithoutRNRegexStr() { -// return "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|[|\\\\|]|^|_|`|{|\\||}|~| |\t|\x0b|\x0c)"; +// return "(0|1|2|3|4|5|6|7|8|9|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|!|\"|#|$|%|&|'|\\(|\\)|\\*|\\+|,|-|.|/|:|;|<|=|>|\\?|@|[|\\\\|]|^|_|`|{|\\||}|~| |\t|\x0b|\x0c)"; // } // function textContextPrefix() { -// return `Content-Type: text/plain; charset="UTF-8"\r\n\r\n`; +// return `Content-Type: text/plain; charset="UTF-8"\r\n\r\n`; // } // function formatRegexPrintable(s) { -// const escaped_string_json = JSON.stringify(s); -// const escaped_string = escaped_string_json.slice(1, escaped_string_json.length - 1); -// return escaped_string -// .replaceAll("\\\\\\\\", "\\") -// .replaceAll("\\\\", "\\") -// .replaceAll("/", "\\/") -// .replaceAll("\u000b", "\\♥") -// .replaceAll("^", "\\^") -// .replaceAll("$", "\\$") -// .replaceAll("|[|", "|\\[|") -// .replaceAll("|]|", "|\\]|") -// .replaceAll("|.|", "|\\.|") -// .replaceAll("|$|", "|\\$|") -// .replaceAll("|^|", "|\\^|"); +// const escaped_string_json = JSON.stringify(s); +// const escaped_string = escaped_string_json.slice(1, escaped_string_json.length - 1); +// return escaped_string +// .replaceAll("\\\\\\\\", "\\") +// .replaceAll("\\\\", "\\") +// .replaceAll("/", "\\/") +// .replaceAll("\u000b", "\\♥") +// .replaceAll("^", "\\^") +// .replaceAll("$", "\\$") +// .replaceAll("|[|", "|\\[|") +// .replaceAll("|]|", "|\\]|") +// .replaceAll("|.|", "|\\.|") +// .replaceAll("|$|", "|\\$|") +// .replaceAll("|^|", "|\\^|"); // } // 
module.exports = { -// regexToDfa +// regexToDfa // }; diff --git a/packages/compiler/src/tests/mod.rs b/packages/compiler/src/tests/mod.rs new file mode 100644 index 0000000..0497cce --- /dev/null +++ b/packages/compiler/src/tests/mod.rs @@ -0,0 +1 @@ +mod regex_to_dfa; \ No newline at end of file diff --git a/packages/compiler/src/tests/regex_to_dfa.rs b/packages/compiler/src/tests/regex_to_dfa.rs new file mode 100644 index 0000000..95ddf8b --- /dev/null +++ b/packages/compiler/src/tests/regex_to_dfa.rs @@ -0,0 +1,86 @@ +use crate::js_caller::{JsCallerError, regex_to_dfa}; + +#[test] +fn test_regex_to_dfa_case_1() { + let regex = "[a-z]+"; + let dfa = regex_to_dfa(regex).unwrap(); + assert_eq!(serde_json::to_string_pretty(&dfa).unwrap(), r#"[ + { + "type": "", + "edges": { + "[\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 1 + } + }, + { + "type": "accept", + "edges": { + "[\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 1 + } + } +]"#); +} + +#[test] +fn test_regex_to_dfa_case_2() { + let regex = "[a-z0-9]+"; + let dfa = regex_to_dfa(regex).unwrap(); + assert_eq!(serde_json::to_string_pretty(&dfa).unwrap(), r#"[ + { + "type": "", + "edges": { + "[\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 1 + } + }, + { + "type": "accept", + "edges": { + "[\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 1 + } + } +]"#); +} + +#[test] +fn test_regex_to_dfa_case_3() { + + let regex = "[a-z0-9]+@[a-z0-9]+\r\n"; + let dfa = 
regex_to_dfa(regex).unwrap(); + assert_eq!(serde_json::to_string_pretty(&dfa).unwrap(), r#"[ + { + "type": "", + "edges": { + "[\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 1 + } + }, + { + "type": "", + "edges": { + "[\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 1, + "[\"@\"]": 2 + } + }, + { + "type": "", + "edges": { + "[\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 3 + } + }, + { + "type": "", + "edges": { + "[\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"a\",\"b\",\"c\",\"d\",\"e\",\"f\",\"g\",\"h\",\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]": 3, + "[\"\\r\"]": 4 + } + }, + { + "type": "", + "edges": { + "[\"\\n\"]": 5 + } + }, + { + "type": "accept", + "edges": {} + } +]"#); +} \ No newline at end of file diff --git a/packages/compiler/tsconfig.json b/packages/compiler/tsconfig.json index 7db0f2c..515784b 100644 --- a/packages/compiler/tsconfig.json +++ b/packages/compiler/tsconfig.json @@ -9,7 +9,7 @@ // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ /* Language and Environment */ - "target": "ES2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. 
*/ + "target": "ES2021", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ // "jsx": "preserve", /* Specify what JSX code is generated. */ // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */