diff --git a/java/com/google/re2j/Compiler.java b/java/com/google/re2j/Compiler.java index 2fc4b82e..ce056dfa 100644 --- a/java/com/google/re2j/Compiler.java +++ b/java/com/google/re2j/Compiler.java @@ -24,6 +24,7 @@ class Compiler { private static class Frag { final int i; // an instruction address (pc). int out; // a patch list; see explanation in Prog.java + boolean nullable; // whether the fragment can match the empty string Frag() { this(0, 0); @@ -34,8 +35,13 @@ private static class Frag { } Frag(int i, int out) { + this(i, out, false); + } + + Frag(int i, int out, boolean nullable) { this.i = i; this.out = out; + this.nullable = nullable; } } @@ -56,7 +62,7 @@ static Prog compileRegexp(Regexp re) { private Frag newInst(int op) { // TODO(rsc): impose length limit. prog.addInst(op); - return new Frag(prog.numInst() - 1); + return new Frag(prog.numInst() - 1, 0, true); } // Returns a no-op fragment. Sometimes unavoidable. @@ -90,7 +96,7 @@ private Frag cat(Frag f1, Frag f2) { } // TODO(rsc): elide nop prog.patch(f1.out, f2.i); - return new Frag(f1.i, f2.out); + return new Frag(f1.i, f2.out, f1.nullable && f2.nullable); } // Given fragments for a and b, returns fragment for a|b. @@ -107,11 +113,16 @@ private Frag alt(Frag f1, Frag f2) { i.out = f1.i; i.arg = f2.i; f.out = prog.append(f1.out, f2.out); + f.nullable = f1.nullable || f2.nullable; return f; } - // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) - private Frag quest(Frag f1, boolean nongreedy) { + // loop returns the fragment for the main loop of a plus or star. + // For plus, it can be used directly, with f1.i as the entry. + // For star, it can be used directly when f1 can't match an empty string. + // (When f1 can match an empty string, f1* must be implemented as (f1+)? + // to get the priority match order correct.) 
+ private Frag loop(Frag f1, boolean nongreedy) { Frag f = newInst(Inst.ALT); Inst i = prog.getInst(f.i); if (nongreedy) { @@ -121,12 +132,12 @@ private Frag quest(Frag f1, boolean nongreedy) { i.out = f1.i; f.out = f.i << 1 | 1; } - f.out = prog.append(f.out, f1.out); + prog.patch(f1.out, f.i); return f; } - // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) - private Frag star(Frag f1, boolean nongreedy) { + // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) + private Frag quest(Frag f1, boolean nongreedy) { Frag f = newInst(Inst.ALT); Inst i = prog.getInst(f.i); if (nongreedy) { @@ -136,13 +147,21 @@ private Frag star(Frag f1, boolean nongreedy) { i.out = f1.i; f.out = f.i << 1 | 1; } - prog.patch(f1.out, f.i); + f.out = prog.append(f.out, f1.out); return f; } + // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) + private Frag star(Frag f1, boolean nongreedy) { + if (f1.nullable) { + return quest(plus(f1, nongreedy), nongreedy); + } + return loop(f1, nongreedy); + } + // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) private Frag plus(Frag f1, boolean nongreedy) { - return new Frag(f1.i, star(f1, nongreedy).out); + return new Frag(f1.i, loop(f1, nongreedy).out, f1.nullable); } // op is a bitmask of EMPTY_* flags. 
@@ -160,6 +179,7 @@ private Frag rune(int rune, int flags) { // flags : parser flags private Frag rune(int[] runes, int flags) { Frag f = newInst(Inst.RUNE); + f.nullable = false; Inst i = prog.getInst(f.i); i.runes = runes; flags &= RE2.FOLD_CASE; // only relevant flag is FoldCase diff --git a/javatests/com/google/re2j/FindTest.java b/javatests/com/google/re2j/FindTest.java index 58b83d3e..12a983a9 100644 --- a/javatests/com/google/re2j/FindTest.java +++ b/javatests/com/google/re2j/FindTest.java @@ -249,6 +249,7 @@ public String toString() { 35, 35, 36), + new Test("(|a)*", "aa", 3, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2), }; @Parameters @@ -538,7 +539,14 @@ private void testSubmatch(String testName, Test test, int n, String[] result) { System.err.println(testName + " " + test + " " + n + " " + k + " "); String expect = test.submatchString(n, k / 2); if (!expect.equals(result[k / 2])) { - fail(String.format("%s %d: expected %s got %s: %s", testName, n, expect, result, test)); + fail( + String.format( + "%s %d: expected %s got %s: %s", + testName, + n, + expect, + Arrays.toString(result), + test)); } } } diff --git a/javatests/com/google/re2j/ProgTest.java b/javatests/com/google/re2j/ProgTest.java index da70615f..4abc3ecf 100644 --- a/javatests/com/google/re2j/ProgTest.java +++ b/javatests/com/google/re2j/ProgTest.java @@ -94,6 +94,25 @@ public class ProgTest { "(?:(?:^).)", "0 fail\n" + "1* empty 4 -> 2\n" + "2 anynotnl -> 3\n" + "3 match\n" }, + { + "(?:|a)+", + "0 fail\n" + + "1 nop -> 4\n" + + "2 rune1 \"a\" -> 4\n" + + "3* alt -> 1, 2\n" + + "4 alt -> 3, 5\n" + + "5 match\n" + }, + { + "(?:|a)*", + "0 fail\n" + + "1 nop -> 4\n" + + "2 rune1 \"a\" -> 4\n" + + "3 alt -> 1, 2\n" + + "4 alt -> 3, 6\n" + + "5* alt -> 3, 6\n" + + "6 match\n" + }, }; private final String input; diff --git a/testdata/basic.dat b/testdata/basic.dat index 7859290b..1776b1ff 100644 --- a/testdata/basic.dat +++ b/testdata/basic.dat @@ -124,24 +124,20 @@ E ((a)) abc (0,1)(0,1)(0,1) E (a)b(c) 
abc (0,3)(0,1)(2,3) E a+b+c aabbabc (4,7) E a* aaa (0,3) -#E (a*)* - (0,0)(0,0) -E (a*)* - (0,0)(?,?) RE2/Go +E (a*)* - (0,0)(0,0) E (a*)+ - (0,0)(0,0) -#E (a*|b)* - (0,0)(0,0) -E (a*|b)* - (0,0)(?,?) RE2/Go +E (a*|b)* - (0,0)(0,0) E (a+|b)* ab (0,2)(1,2) E (a+|b)+ ab (0,2)(1,2) E (a+|b)? ab (0,1)(0,1) BE [^ab]* cde (0,3) -#E (^)* - (0,0)(0,0) -E (^)* - (0,0)(?,?) RE2/Go +E (^)* - (0,0)(0,0) BE a* NULL (0,0) E ([abc])*d abbbcd (0,6)(4,5) E ([abc])*bcd abcd (0,4)(0,1) E a|b|c|d|e e (0,1) E (a|b|c|d|e)f ef (0,2)(0,1) -#E ((a*|b))* - (0,0)(0,0)(0,0) -E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go +E ((a*|b))* - (0,0)(0,0)(0,0) BE abcd*efg abcdefg (0,7) BE ab* xabyabbbz (1,3) BE ab* xayabbbz (1,2) diff --git a/testdata/nullsubexpr.dat b/testdata/nullsubexpr.dat index 2e18fbb9..68d9c999 100644 --- a/testdata/nullsubexpr.dat +++ b/testdata/nullsubexpr.dat @@ -1,8 +1,7 @@ NOTE null subexpression matches : 2002-06-06 E (a*)* a (0,1)(0,1) -#E SAME x (0,0)(0,0) -E SAME x (0,0)(?,?) RE2/Go +E SAME x (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E (a*)+ a (0,1)(0,1) @@ -19,8 +18,7 @@ E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([a]*)* a (0,1)(0,1) -#E SAME x (0,0)(0,0) -E SAME x (0,0)(?,?) RE2/Go +E SAME x (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([a]*)+ a (0,1)(0,1) @@ -28,8 +26,7 @@ E SAME x (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([^b]*)* a (0,1)(0,1) -#E SAME b (0,0)(0,0) -E SAME b (0,0)(?,?) RE2/Go +E SAME b (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaab (0,6)(0,6) E ([ab]*)* a (0,1)(0,1) @@ -41,11 +38,9 @@ E SAME bbbbbb (0,6)(0,6) E SAME aaaabcde (0,5)(0,5) E ([^a]*)* b (0,1)(0,1) E SAME bbbbbb (0,6)(0,6) -#E SAME aaaaaa (0,0)(0,0) -E SAME aaaaaa (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,0)(0,0) E ([^ab]*)* ccccxx (0,6)(0,6) -#E SAME ababab (0,0)(0,0) -E SAME ababab (0,0)(?,?) 
RE2/Go +E SAME ababab (0,0)(0,0) E ((z)+|a)* zabcde (0,2)(1,2) @@ -65,8 +60,7 @@ B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) -#E (a*)*(x) x (0,1)(0,0)(0,1) -E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) x (0,1)(0,0)(0,1) E (a*)*(x) ax (0,2)(0,1)(1,2) E (a*)*(x) axa (0,2)(0,1)(1,2) diff --git a/testdata/re2-exhaustive.txt.gz b/testdata/re2-exhaustive.txt.gz index c6eebf00..4482caf9 100644 Binary files a/testdata/re2-exhaustive.txt.gz and b/testdata/re2-exhaustive.txt.gz differ diff --git a/testdata/re2-search.txt b/testdata/re2-search.txt index f648e552..8c4098a4 100644 --- a/testdata/re2-search.txt +++ b/testdata/re2-search.txt @@ -1,5 +1,5 @@ # RE2 basic search tests built by make log -# Thu Sep 8 13:43:43 EDT 2011 +# Wed May 12 12:13:22 EDT 2021 Regexp.SearchTests strings "" @@ -227,22 +227,6 @@ regexps 0-0;0-0;0-0;0-0 strings "" -"" -regexps -"a*" -0-0;0-0;0-0;0-0 -0-0;0-0;0-0;0-0 -"^(?:a*)$" -0-0;0-0;0-0;0-0 -0-0;0-0;0-0;0-0 -"^(?:a*)" -0-0;0-0;0-0;0-0 -0-0;0-0;0-0;0-0 -"(?:a*)$" -0-0;0-0;0-0;0-0 -0-0;0-0;0-0;0-0 -strings -"" "xabcdx" regexps "ab|cd" @@ -3651,6 +3635,86 @@ regexps 0-1;0-1;0-1;0-1 strings "" +"a" +regexps +"a\\C+" +-;-;-;- +-;-;-;- +"^(?:a\\C+)$" +-;-;-;- +-;-;-;- +"^(?:a\\C+)" +-;-;-;- +-;-;-;- +"(?:a\\C+)$" +-;-;-;- +-;-;-;- +strings +"" +"a" +regexps +"a\\C?" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C?)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C*?" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*?)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C*?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C+?" +-;-;-;- +-;-;-;- +"^(?:a\\C+?)$" +-;-;-;- +-;-;-;- +"^(?:a\\C+?)" +-;-;-;- +-;-;-;- +"(?:a\\C+?)$" +-;-;-;- +-;-;-;- +strings +"" +"a" +regexps +"a\\C??" 
+-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C??)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C??)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C??)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" "baba" regexps "a\\C*|ba\\C" @@ -3665,3 +3729,51 @@ regexps "(?:a\\C*|ba\\C)$" -;-;-;- -;1-4;-;1-4 +strings +"" +"Inc." +regexps +"\\w*I\\w*" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\w*I\\w*)$" +-;-;-;- +-;-;-;- +"^(?:\\w*I\\w*)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\w*I\\w*)$" +-;-;-;- +-;-;-;- +strings +"" +"aaa" +regexps +"(?:|a)*" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"^(?:(?:|a)*)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +"^(?:(?:|a)*)" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"(?:(?:|a)*)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +strings +"" +"aaa" +regexps +"(?:|a)+" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"^(?:(?:|a)+)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +"^(?:(?:|a)+)" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"(?:(?:|a)+)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3