Skip to content

Commit

Permalink
Improve auto-atomicity for conditionals (#63299)
Browse files Browse the repository at this point in the history
- Allow for automatically making loops that end conditional branches be atomic
- Allow for using a following expression conditional to make a prior loop atomic
- Allow conditionals to influence the computed minimum length
  • Loading branch information
stephentoub authored Jan 19, 2022
1 parent 8fe0467 commit 31f20c5
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -441,11 +441,12 @@ internal RegexNode FinalOptimize()
/// </remarks>
private void EliminateEndingBacktracking()
{
Debug.Assert((Options & RegexOptions.NonBacktracking) == 0, "NonBacktracking doesn't have backtracking to be eliminated and doesn't support atomic groups.");

if (!StackHelper.TryEnsureSufficientExecutionStack())
if (!StackHelper.TryEnsureSufficientExecutionStack() ||
(Options & (RegexOptions.RightToLeft | RegexOptions.NonBacktracking)) != 0)
{
// If we can't recur further, just stop optimizing.
// We haven't done the work to validate this is correct for RTL.
// And NonBacktracking doesn't support atomic groups and doesn't have backtracking to be eliminated.
return;
}

Expand All @@ -470,15 +471,15 @@ private void EliminateEndingBacktracking()
continue;

// For Capture and Concatenate, we just recur into their last child (only child in the case
// of Capture). However, if the child is Alternate, Loop, and Lazyloop, we can also make the
// of Capture). However, if the child is an alternation or loop, we can also make the
// node itself atomic by wrapping it in an Atomic node. Since we later check to see whether a
// node is atomic based on its parent or grandparent, we don't bother wrapping such a node in
// an Atomic one if its grandparent is already Atomic.
// e.g. [xyz](?:abc|def) => [xyz](?>abc|def)
case Capture:
case Concatenate:
RegexNode existingChild = node.Child(node.ChildCount() - 1);
if ((existingChild.Type is Alternate or Loop or Lazyloop) &&
if ((existingChild.Type is Alternate or Testref or Testgroup or Loop or Lazyloop) &&
(node.Next is null || node.Next.Type != Atomic)) // validate grandparent isn't atomic
{
var atomic = new RegexNode(Atomic, existingChild.Options);
Expand All @@ -489,17 +490,25 @@ private void EliminateEndingBacktracking()
continue;

// For alternate, we can recur into each branch separately. We use this iteration for the first branch.
// Conditionals are just like alternations in this regard.
// e.g. abc*|def* => ab(?>c*)|de(?>f*)
case Alternate:
case Testref:
case Testgroup:
{
int branches = node.ChildCount();
for (int i = 1; i < branches; i++)
{
node.Child(i).EliminateEndingBacktracking();
}

if (node.Type != Testgroup) // ReduceTestgroup will have already applied ending backtracking removal
{
node = node.Child(0);
continue;
}
}
node = node.Child(0);
continue;
break;

// For {Lazy}Loop, we search to see if there's a viable last expression, and iff there
// is we recur into processing it. Also, as with the single-char lazy loops, LazyLoop
Expand Down Expand Up @@ -1793,15 +1802,19 @@ static void ProcessNode(RegexNode node, RegexNode subsequent)
node.MakeLoopAtomic();
break;
case Alternate:
case Testref:
case Testgroup:
// In the case of alternation, we can't change the alternation node itself
// based on what comes after it (at least not with more complicated analysis
// that factors in all branches together), but we can look at each individual
// branch, and analyze ending loops in each branch individually to see if they
// can be made atomic. Then if we do end up backtracking into the alternation,
// we at least won't need to backtrack into that loop.
// we at least won't need to backtrack into that loop. The same is true for
// conditionals, though we don't want to process the condition expression
// itself, as it's already considered atomic and handled as part of ReduceTestgroup.
{
int alternateBranches = node.ChildCount();
for (int b = 0; b < alternateBranches; b++)
for (int b = node.Type == Testgroup ? 1 : 0; b < alternateBranches; b++)
{
ProcessNode(node.Child(b), subsequent);
}
Expand Down Expand Up @@ -1858,6 +1871,11 @@ private RegexNode ReduceRequire()
Debug.Assert(Type == Require);
Debug.Assert(ChildCount() == 1);

// A positive lookaround is a zero-width atomic assertion.
// As it's atomic, nothing will backtrack into it, and we can
// eliminate any ending backtracking from it.
EliminateEndingBacktracking();

// A positive lookaround wrapped around an empty is a nop, and can just
// be made into an empty. A developer typically doesn't write this, but
// rather it evolves due to optimizations resulting in empty.
Expand Down Expand Up @@ -1932,6 +1950,11 @@ private RegexNode ReduceTestgroup()
ReplaceChild(0, condition.Child(0));
}

// We can also eliminate any ending backtracking in the condition, as the condition
// is considered to be a positive lookahead, which is an atomic zero-width assertion.
condition = Child(0);
condition.EliminateEndingBacktracking();

return this;
}

Expand All @@ -1954,7 +1977,8 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool a
while (true)
{
// Skip the successor down to the closest node that's guaranteed to follow it.
while (subsequent.ChildCount() > 0)
int childCount;
while ((childCount = subsequent.ChildCount()) > 0)
{
Debug.Assert(subsequent.Type != Group);
switch (subsequent.Type)
Expand All @@ -1972,25 +1996,30 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool a
}

// If the two nodes don't agree on options in any way, don't try to optimize them.
// TODO: Remove this once https://github.com/dotnet/runtime/issues/61048 is implemented.
if (node.Options != subsequent.Options)
{
return false;
}

// If the successor is an alternation, all of its children need to be evaluated, since any of them
// could come after this node. If any of them fail the optimization, then the whole node fails.
if (subsequent.Type == Alternate)
// This applies to expression conditionals as well, as long as they have both a yes and a no branch (if there's
// only a yes branch, we'd need to also check whatever comes after the conditional). It doesn't apply to
// backreference conditionals, as the condition itself is unknown statically and could overlap with the
// loop being considered for atomicity.
switch (subsequent.Type)
{
int childCount = subsequent.ChildCount();
for (int i = 0; i < childCount; i++)
{
if (!CanBeMadeAtomic(node, subsequent.Child(i), allowSubsequentIteration))
case Alternate:
case Testgroup when childCount == 3: // condition, yes, and no branch
for (int i = 0; i < childCount; i++)
{
return false;
if (!CanBeMadeAtomic(node, subsequent.Child(i), allowSubsequentIteration))
{
return false;
}
}
}

return true;
return true;
}

// If this node is a {one/notone/set}loop, see if it overlaps with its successor in the concatenation.
Expand Down Expand Up @@ -2183,6 +2212,14 @@ public int ComputeMinLength()
return min;
}

case Testref:
// Minimum of its yes and no branches. The backreference doesn't add to the length.
return Math.Min(Child(0).ComputeMinLength(), Child(1).ComputeMinLength());

case Testgroup:
// Minimum of its yes and no branches. The condition is a zero-width assertion.
return Math.Min(Child(1).ComputeMinLength(), Child(2).ComputeMinLength());

case Concatenate:
// The sum of all of the concatenation's children.
{
Expand Down Expand Up @@ -2225,8 +2262,6 @@ public int ComputeMinLength()
// a different structure, as they can't be added as part of a concatenation, since they overlap
// with what comes after.
case Ref:
case Testgroup:
case Testref:
// Constructs requiring data at runtime from the matching pattern can't influence min length.
return 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,18 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("(?(a)a)", "(?(a)a|)")]
[InlineData("(?(abc)def)", "(?(abc)def|)")]
[InlineData("(?(\\w)\\d)", "(?(\\w)\\d|)")]
// Loops inside alternation constructs
[InlineData("(abc*|def)ghi", "(ab(?>c*)|def)ghi")]
[InlineData("(abc|def*)ghi", "(abc|de(?>f*))ghi")]
[InlineData("(abc*|def*)ghi", "(ab(?>c*)|de(?>f*))ghi")]
[InlineData("(abc*|def*)", "(ab(?>c*)|de(?>f*))")]
[InlineData("(?(\\w)abc*|def*)ghi", "(?(\\w)ab(?>c*)|de(?>f*))ghi")]
[InlineData("(?(\\w)abc*|def*)", "(?(\\w)ab(?>c*)|de(?>f*))")]
[InlineData("(?(xyz*)abc|def)", "(?(xy(?>z*))abc|def)")]
[InlineData("(?(xyz*)abc|def)\\w", "(?(xy(?>z*))abc|def)\\w")]
// Loops followed by alternation constructs
[InlineData("a*(bcd|efg)", "(?>a*)(bcd|efg)")]
[InlineData("a*(?(xyz)bcd|efg)", "(?>a*)(?(xyz)bcd|efg)")]
// Auto-atomicity
[InlineData("a*b", "(?>a*)b")]
[InlineData("a*b+", "(?>a*)(?>b+)")]
Expand Down Expand Up @@ -499,6 +511,18 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
[InlineData("(w+)+", "((?>w+))+")]
[InlineData("(w{1,2})+", "((?>w{1,2}))+")]
[InlineData("(?:ab|cd|ae)f", "(?>ab|cd|ae)f")]
// Loops inside alternation constructs
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]
[InlineData("(abc*|def*)\\whi", "(ab(?>c*)|de(?>f*))\\whi")]
[InlineData("(?(\\w)abc*|def*)\\whi", "(?(\\w)ab(?>c*)|de(?>f*))\\whi")]
// Loops followed by alternation constructs
[InlineData("a*(bcd|afg)", "(?>a*)(bcd|afg)")]
[InlineData("(a*)(?(1)bcd|efg)", "((?>a*))(?(1)bcd|efg)")]
[InlineData("a*(?(abc)bcd|efg)", "(?>a*)(?(abc)bcd|efg)")]
[InlineData("a*(?(xyz)acd|efg)", "(?>a*)(?(xyz)acd|efg)")]
[InlineData("a*(?(xyz)bcd|afg)", "(?>a*)(?(xyz)bcd|afg)")]
[InlineData("a*(?(xyz)bcd)", "(?>a*)(?(xyz)bcd)")]
public void PatternsReduceDifferently(string pattern1, string pattern2)
{
string result1 = GetRegexCodes(new Regex(pattern1));
Expand Down Expand Up @@ -545,6 +569,20 @@ public void PatternsReduceDifferently(string pattern1, string pattern2)
[InlineData(@"(YZ+|(WX+|(UV+|(ST+|(QR+|(OP+|(MN+|(KL+|(IJ+|(GH+|(EF+|(CD+|(AB+|(89+|(67+|(45+|(23+|(01+|(yz+|(wx+|(uv+|(st+|(qr+|(op+|(mn+|(kl+|(ij+|(gh+|(ef+|(de+|(a|bc+)))))))))))))))))))))))))))))))", 1)]
[InlineData(@"a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(a(ab|cd+)|ef+)|gh+)|ij+)|kl+)|mn+)|op+)|qr+)|st+)|uv+)|wx+)|yz+)|01+)|23+)|45+)|67+)|89+)|AB+)|CD+)|EF+)|GH+)|IJ+)|KL+)|MN+)|OP+)|QR+)|ST+)|UV+)|WX+)|YZ+)", 3)]
[InlineData(@"(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))", 1)]
[InlineData(@"(?(\d)\d{3}|\d)", 1)]
[InlineData(@"(?(\d{7})\d{3}|\d{2})", 2)]
[InlineData(@"(?(\d{7})\d{2}|\d{3})", 2)]
[InlineData(@"(?(\d)\d{3}|\d{2})", 2)]
[InlineData(@"(?(\d)|\d{2})", 0)]
[InlineData(@"(?(\d)\d{3})", 0)]
[InlineData(@"(abc)(?(1)\d{3}|\d{2})", 5)]
[InlineData(@"(abc)(?(1)\d{2}|\d{3})", 5)]
[InlineData(@"(abc)(?(1)|\d{2})", 3)]
[InlineData(@"(abc)(?(1)\d{3})", 3)]
[InlineData(@"(abc|)", 0)]
[InlineData(@"(|abc)", 0)]
[InlineData(@"(?(x)abc|)", 0)]
[InlineData(@"(?(x)|abc)", 0)]
public void MinRequiredLengthIsCorrect(string pattern, int expectedLength)
{
var r = new Regex(pattern);
Expand Down

0 comments on commit 31f20c5

Please sign in to comment.