From b28af20c980f5dc1be811cf89d0480cb8144c720 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 8 Mar 2022 12:06:57 -0500 Subject: [PATCH 1/2] Use StartsWith(..., OrdinalIgnoreCase) in RegexCompiler / source generator When we encounter a sequence of sets representing case-insensitive ASCII, we can simplify the code generated to just call StartsWith, which both makes it more readable but also takes advantage of the new JIT optimization to lower that into efficient vectorized comparisons based on the supplied literal. This also cleans up some formatting in the source generator emitted code to make things much more concise and less noisy. --- .../gen/RegexGenerator.Emitter.cs | 179 +++++++++++------- .../Text/RegularExpressions/RegexCompiler.cs | 91 ++++++--- .../Text/RegularExpressions/RegexNode.cs | 80 ++++++++ 3 files changed, 247 insertions(+), 103 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 0220a7782290ea..88812c21606119 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -1021,6 +1021,9 @@ void CaseGoto(string clause, string label) } // Whether the node has RegexOptions.IgnoreCase set. + // TODO: https://github.com/dotnet/runtime/issues/61048. We should be able to delete this and all usage sites once + // IgnoreCase is erradicated from the tree. The only place it should possibly be left after that work is in a Backreference, + // and that can do this check directly as needed. static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0; // Slices the inputSpan starting at pos until end and stores it into slice. @@ -1127,7 +1130,7 @@ void EmitAlternation(RegexNode node) // If it's not a One, Multi, or Set, we can't apply this optimization. // If it's IgnoreCase (and wasn't reduced to a non-IgnoreCase set), also ignore it to keep the logic simple. if (node.Child(i).FindBranchOneMultiOrSetStart() is not RegexNode oneMultiOrSet || - (oneMultiOrSet.Options & RegexOptions.IgnoreCase) != 0) // TODO: https://github.com/dotnet/runtime/issues/61048 + IsCaseInsensitive(oneMultiOrSet)) { useSwitchedBranches = false; break; @@ -1208,7 +1211,7 @@ void EmitSwitchedBranches() RegexNode? childStart = child.FindBranchOneMultiOrSetStart(); Debug.Assert(childStart is not null, "Unexpectedly couldn't find the branch starting node."); - Debug.Assert((childStart.Options & RegexOptions.IgnoreCase) == 0, "Expected only to find non-IgnoreCase branch starts"); + Debug.Assert(!IsCaseInsensitive(childStart), "Expected only to find non-IgnoreCase branch starts"); if (childStart.Kind is RegexNodeKind.Set) { @@ -2051,14 +2054,15 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck TransferSliceStaticPosToPos(); } - // Separate out several node types that, for conciseness, don't need a header and scope written into the source. + // Separate out several node types that, for conciseness, don't need a header nor scope written into the source. + // Effectively these either evaporate, are completely self-explanatory, or only exist for their children to be rendered. switch (node.Kind) { - // Nothing is written for an empty + // Nothing is written for an empty. case RegexNodeKind.Empty: return; - // A match failure doesn't need a scope. + // A single-line goto for a failure doesn't need a scope or comment. case RegexNodeKind.Nothing: Goto(doneLabel); return; @@ -2075,79 +2079,95 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck return; } - // Put the node's code into its own scope. If the node contains labels that may need to - // be visible outside of its scope, the scope is still emitted for clarity but is commented out. - using (EmitScope(writer, DescribeNode(node, analysis), faux: analysis.MayBacktrack(node))) + // For everything else, output a comment about what the node is. + writer.WriteLine($"// {DescribeNode(node, analysis)}"); + + // Separate out several node types that, for conciseness, don't need a scope written into the source as they're + // always a single statement / block. + switch (node.Kind) { - switch (node.Kind) - { - case RegexNodeKind.Beginning: - case RegexNodeKind.Start: - case RegexNodeKind.Bol: - case RegexNodeKind.Eol: - case RegexNodeKind.End: - case RegexNodeKind.EndZ: - EmitAnchors(node); - break; + case RegexNodeKind.Beginning: + case RegexNodeKind.Start: + case RegexNodeKind.Bol: + case RegexNodeKind.Eol: + case RegexNodeKind.End: + case RegexNodeKind.EndZ: + EmitAnchors(node); + return; - case RegexNodeKind.Boundary: - case RegexNodeKind.NonBoundary: - case RegexNodeKind.ECMABoundary: - case RegexNodeKind.NonECMABoundary: - EmitBoundary(node); - break; + case RegexNodeKind.Boundary: + case RegexNodeKind.NonBoundary: + case RegexNodeKind.ECMABoundary: + case RegexNodeKind.NonECMABoundary: + EmitBoundary(node); + return; + case RegexNodeKind.One: + case RegexNodeKind.Notone: + case RegexNodeKind.Set: + EmitSingleChar(node, emitLengthChecksIfRequired); + return; + + case RegexNodeKind.Multi when !IsCaseInsensitive(node) && (node.Options & RegexOptions.RightToLeft) == 0: + EmitMultiChar(node, emitLengthChecksIfRequired); + return; + + case RegexNodeKind.UpdateBumpalong: + EmitUpdateBumpalong(node); + return; + } + + // For everything else, put the node's code into its own scope, purely for readability. If the node contains labels + // that may need to be visible outside of its scope, the scope is still emitted for clarity but is commented out. + using (EmitScope(writer, null, faux: analysis.MayBacktrack(node))) + { + switch (node.Kind) + { case RegexNodeKind.Multi: EmitMultiChar(node, emitLengthChecksIfRequired); - break; - - case RegexNodeKind.One: - case RegexNodeKind.Notone: - case RegexNodeKind.Set: - EmitSingleChar(node, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Oneloop: case RegexNodeKind.Notoneloop: case RegexNodeKind.Setloop: EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Onelazy: case RegexNodeKind.Notonelazy: case RegexNodeKind.Setlazy: EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Oneloopatomic: case RegexNodeKind.Notoneloopatomic: case RegexNodeKind.Setloopatomic: EmitSingleCharAtomicLoop(node, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Loop: EmitLoop(node); - break; + return; case RegexNodeKind.Lazyloop: EmitLazy(node); - break; + return; case RegexNodeKind.Alternate: EmitAlternation(node); - break; + return; case RegexNodeKind.Backreference: EmitBackreference(node); - break; + return; case RegexNodeKind.BackreferenceConditional: EmitBackreferenceConditional(node); - break; + return; case RegexNodeKind.ExpressionConditional: EmitExpressionConditional(node); - break; + return; case RegexNodeKind.Atomic: Debug.Assert(analysis.MayBacktrack(node.Child(0))); @@ -2156,25 +2176,20 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck case RegexNodeKind.Capture: EmitCapture(node, subsequent); - break; + return; case RegexNodeKind.PositiveLookaround: EmitPositiveLookaroundAssertion(node); - break; + return; case RegexNodeKind.NegativeLookaround: EmitNegativeLookaroundAssertion(node); - break; - - case RegexNodeKind.UpdateBumpalong: - EmitUpdateBumpalong(node); - break; - - default: - Debug.Fail($"Unexpected node type: {node.Kind}"); - break; + return; } } + + // All nodes should have been handled. + Debug.Fail($"Unexpected node type: {node.Kind}"); } // Emits the node for an atomic. @@ -2230,7 +2245,8 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe for (int i = 0; i < childCount; i++) { // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence - // and then skip the individual length checks for each. We also want to minimize the repetition of if blocks, + // and then skip the individual length checks for each. We can also discover case-insensitive sequences that + // can be checked efficiently with methods like StartsWith. We also want to minimize the repetition of if blocks, // and so we try to emit a series of clauses all part of the same if block rather than one if block per child. if ((node.Options & RegexOptions.RightToLeft) == 0 && emitLengthChecksIfRequired && @@ -2243,7 +2259,7 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe { for (; i < exclusiveEnd; i++) { - void WriteSingleCharChild(RegexNode child, bool includeDescription = true) + void WritePrefix() { if (wroteClauses) { @@ -2254,31 +2270,41 @@ void WriteSingleCharChild(RegexNode child, bool includeDescription = true) { writer.Write("if ("); } - EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true); - prevDescription = includeDescription ? DescribeNode(child, analysis) : null; - wroteClauses = true; } RegexNode child = node.Child(i); - if (child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set) + if (node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, out int nodesConsumed, out string? caseInsensitiveString)) { - WriteSingleCharChild(child); + WritePrefix(); + string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan; + writer.Write($"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(caseInsensitiveString)}, global::System.StringComparison.OrdinalIgnoreCase)"); + prevDescription = $"Match the string {Literal(caseInsensitiveString)} (ordinal case-insensitive)"; + wroteClauses = true; + + sliceStaticPos += caseInsensitiveString.Length; + i += nodesConsumed - 1; } - else if (child.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or RegexNodeKind.Oneloopatomic or - RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic or - RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloopatomic && + else if (child.Kind is RegexNodeKind.Multi && !IsCaseInsensitive(child)) + { + WritePrefix(); + EmitMultiCharString(child.Str!, caseInsensitive: false, emitLengthCheck: false, clauseOnly: true, rightToLeft: false); + prevDescription = DescribeNode(child, analysis); + wroteClauses = true; + } + else if ((child.IsOneFamily || child.IsNotoneFamily || child.IsSetFamily) && child.M == child.N && child.M <= MaxUnrollSize) { - for (int c = 0; c < child.M; c++) + int repeatCount = child.Kind is RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set ? 1 : child.M; + for (int c = 0; c < repeatCount; c++) { - WriteSingleCharChild(child, includeDescription: c == 0); + WritePrefix(); + EmitSingleChar(child, emitLengthCheck: false, clauseOnly: true); + prevDescription = c == 0 ? DescribeNode(child, analysis) : null; + wroteClauses = true; } } - else - { - break; - } + else break; } if (wroteClauses) @@ -2486,12 +2512,13 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck) { Debug.Assert(node.Kind is RegexNodeKind.Multi, $"Unexpected type: {node.Kind}"); Debug.Assert(node.Str is not null); - EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, (node.Options & RegexOptions.RightToLeft) != 0); + EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, clauseOnly: false, (node.Options & RegexOptions.RightToLeft) != 0); } - void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool rightToLeft) + void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool clauseOnly, bool rightToLeft) { Debug.Assert(str.Length >= 2); + Debug.Assert(!clauseOnly || (!emitLengthCheck && !caseInsensitive && !rightToLeft)); if (rightToLeft) { @@ -2534,9 +2561,17 @@ void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, else { string sourceSpan = sliceStaticPos > 0 ? $"{sliceSpan}.Slice({sliceStaticPos})" : sliceSpan; - using (EmitBlock(writer, $"if (!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)}))")) + string clause = $"!global::System.MemoryExtensions.StartsWith({sourceSpan}, {Literal(str)})"; + if (clauseOnly) { - Goto(doneLabel); + writer.Write(clause); + } + else + { + using (EmitBlock(writer, $"if ({clause})")) + { + Goto(doneLabel); + } } } @@ -3061,7 +3096,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) case <= RegexNode.MultiVsRepeaterLimit when node.IsOneFamily && !IsCaseInsensitive(node): // This is a repeated case-sensitive character; emit it as a multi in order to get all the optimizations // afforded to a multi, e.g. unrolling the loop with multi-char reads/comparisons at a time. - EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, rtl); + EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, clauseOnly: false, rtl); return; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 573e5e4a9358ee..aa0b223bf41a6f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -55,7 +55,8 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanLastIndexOfSpan = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanSliceIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int) })!; private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; - private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanStartsWithSpan = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanStartsWithSpanComparison = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan), typeof(ReadOnlySpan), typeof(StringComparison) })!; private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!; private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; private static readonly MethodInfo s_textInfoToLowerMethod = typeof(TextInfo).GetMethod("ToLower", new Type[] { typeof(char) })!; @@ -1404,6 +1405,10 @@ protected void EmitTryMatchAtCurrentPosition() // Generated code successfully. return; + // Whether the node has RegexOptions.IgnoreCase set. + // TODO: https://github.com/dotnet/runtime/issues/61048. We should be able to delete this and all usage sites once + // IgnoreCase is erradicated from the tree. The only place it should possibly be left after that work is in a Backreference, + // and that can do this check directly as needed. static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0; // Slices the inputSpan starting at pos until end and stores it into slice. @@ -2367,103 +2372,102 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck case RegexNodeKind.End: case RegexNodeKind.EndZ: EmitAnchors(node); - break; + return; case RegexNodeKind.Boundary: case RegexNodeKind.NonBoundary: case RegexNodeKind.ECMABoundary: case RegexNodeKind.NonECMABoundary: EmitBoundary(node); - break; + return; case RegexNodeKind.Multi: EmitMultiChar(node, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.One: case RegexNodeKind.Notone: case RegexNodeKind.Set: EmitSingleChar(node, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Oneloop: case RegexNodeKind.Notoneloop: case RegexNodeKind.Setloop: EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Onelazy: case RegexNodeKind.Notonelazy: case RegexNodeKind.Setlazy: EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Oneloopatomic: case RegexNodeKind.Notoneloopatomic: case RegexNodeKind.Setloopatomic: EmitSingleCharAtomicLoop(node); - break; + return; case RegexNodeKind.Loop: EmitLoop(node); - break; + return; case RegexNodeKind.Lazyloop: EmitLazy(node); - break; + return; case RegexNodeKind.Alternate: EmitAlternation(node); - break; + return; case RegexNodeKind.Concatenate: EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); - break; + return; case RegexNodeKind.Atomic: EmitAtomic(node, subsequent); - break; + return; case RegexNodeKind.Backreference: EmitBackreference(node); - break; + return; case RegexNodeKind.BackreferenceConditional: EmitBackreferenceConditional(node); - break; + return; case RegexNodeKind.ExpressionConditional: EmitExpressionConditional(node); - break; + return; case RegexNodeKind.Capture: EmitCapture(node, subsequent); - break; + return; case RegexNodeKind.PositiveLookaround: EmitPositiveLookaroundAssertion(node); - break; + return; case RegexNodeKind.NegativeLookaround: EmitNegativeLookaroundAssertion(node); - break; + return; case RegexNodeKind.Nothing: BrFar(doneLabel); - break; + return; case RegexNodeKind.Empty: // Emit nothing. - break; + return; case RegexNodeKind.UpdateBumpalong: EmitUpdateBumpalong(node); - break; - - default: - Debug.Fail($"Unexpected node type: {node.Kind}"); - break; + return; } + + // All nodes should have been handled. + Debug.Fail($"Unexpected node type: {node.Kind}"); } // Emits the node for an atomic. @@ -2534,19 +2538,44 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe Debug.Assert(node.Kind is RegexNodeKind.Concatenate, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}"); - bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; - // Emit the code for each child one after the other. int childCount = node.ChildCount(); for (int i = 0; i < childCount; i++) { // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence - // and then skip the individual length checks for each. - if (!rtl && emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) + // and then skip the individual length checks for each. We can also discover case-insensitive sequences that + // can be checked efficiently with methods like StartsWith. + if ((node.Options & RegexOptions.RightToLeft) == 0 && + emitLengthChecksIfRequired && + node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) { EmitSpanLengthCheck(requiredLength); for (; i < exclusiveEnd; i++) { + if (node.TryGetOrdinalCaseInsensitiveString(i, exclusiveEnd, out int nodesConsumed, out string? caseInsensitiveString)) + { + // if (!sliceSpan.Slice(sliceStaticPause).StartsWith(caseInsensitiveString, StringComparison.OrdinalIgnoreCase)) goto doneLabel; + if (sliceStaticPos > 0) + { + Ldloca(slice); + Ldc(sliceStaticPos); + Call(s_spanSliceIntMethod); + } + else + { + Ldloc(slice); + } + Ldstr(caseInsensitiveString); + Call(s_stringAsSpanMethod); + Ldc((int)StringComparison.OrdinalIgnoreCase); + Call(s_spanStartsWithSpanComparison); + BrfalseFar(doneLabel); + + sliceStaticPos += caseInsensitiveString.Length; + i += nodesConsumed - 1; + continue; + } + EmitNode(node.Child(i), GetSubsequent(i, node, subsequent), emitLengthChecksIfRequired: false); } @@ -2910,7 +2939,7 @@ void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, Call(s_spanSliceIntMethod); Ldstr(str); Call(s_stringAsSpanMethod); - Call(s_spanStartsWith); + Call(s_spanStartsWithSpan); BrfalseFar(doneLabel); sliceStaticPos += str.Length; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 3a0ead32a35d5b..3b8b25539c2bef 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2384,6 +2384,84 @@ RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or } } + /// + /// Determines whether the specified child index of a concatenation begins a sequence whose values + /// should be used to perform an ordinal case-insensitive comparison. + /// + /// The index of the child with which to start the sequence. + /// The exclusive upper bound on the child index to iterate to. + /// How many nodes make up the sequence, if any. + /// The string to use for an ordinal case-insensitive comparison, if any. + /// true if a sequence was found; otherwise, false. + public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChildBound, out int nodesConsumed, [NotNullWhen(true)] out string? caseInsensitiveString) + { + Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}"); + + var vsb = new ValueStringBuilder(stackalloc char[32]); + + // We're looking in particular for sets of ASCII characters, so we focus only on sets with two characters in them, e.g. [Aa]. + Span twoChars = stackalloc char[2]; + + // Iterate from the child index to the exclusive upper bound. + int i = childIndex; + for ( ; i < exclusiveChildBound; i++) + { + // We focus on only Ones and Sets. A sequence of Ones should have already been turned into a Multi, + // so we don't bother looking for Oneloop/etc. + RegexNode child = Child(i); + if (child.Kind is RegexNodeKind.One) + { + // We only want to include ASCII characters, and only if they don't participate in case conversion + // such that they only case to themselves and nothing other cases to them. Otherwise, including + // them would potentially cause us to match against things not allowed by the pattern. + if (child.Ch >= 128 || + RegexCharClass.ParticipatesInCaseConversion(child.Ch)) + { + break; + } + vsb.Append(child.Ch); + } + else if (child.Kind is RegexNodeKind.Set || + (child.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic && child.M == child.N)) + { + // In particular we want to look for sets that contain only the upper and lowercase variant + // of the same ASCII letter. + if (RegexCharClass.IsNegated(child.Str!) || + RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 || + twoChars[0] >= 128 || + twoChars[1] >= 128 || + twoChars[0] == twoChars[1] || + !char.IsLetter(twoChars[0]) || + !char.IsLetter(twoChars[1]) || + ((twoChars[0] | 0x20) != (twoChars[1] | 0x20))) + { + break; + } + + vsb.Append((char)(twoChars[0] | 0x20), child.Kind is RegexNodeKind.Set ? 1 : child.M); + } + else + { + break; + } + } + + // If we found at least two characters, consider it a sequence found. It's possible + // they all came from the same node, so this could be a sequence of just one node. + if (vsb.Length >= 2) + { + caseInsensitiveString = vsb.ToString(); + nodesConsumed = i - childIndex; + return true; + } + + // No sequence found. + caseInsensitiveString = null; + nodesConsumed = 0; + vsb.Dispose(); + return false; + } + /// /// Determine whether the specified child node is the beginning of a sequence that can /// trivially have length checks combined in order to avoid bounds checks. @@ -2403,6 +2481,8 @@ RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or /// public bool TryGetJoinableLengthCheckChildRange(int childIndex, out int requiredLength, out int exclusiveEnd) { + Debug.Assert(Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {Kind}"); + static bool CanJoinLengthCheck(RegexNode node) => node.Kind switch { RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set => true, From 5622dc23141ea17173f733a54c9e5c4016e0207a Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 8 Mar 2022 20:47:02 -0500 Subject: [PATCH 2/2] Address PR feedback --- .../Text/RegularExpressions/RegexCharClass.cs | 14 +++++++++++ .../Text/RegularExpressions/RegexNode.cs | 23 +++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 63eddeaa977f08..a85ecef2303173 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -887,6 +887,20 @@ public static bool ParticipatesInCaseConversion(ReadOnlySpan s) return false; } + /// Gets whether the specified span contains only ASCII. + public static bool IsAscii(ReadOnlySpan s) // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available + { + foreach (char c in s) + { + if (c >= 128) + { + return false; + } + } + + return true; + } + /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents. /// This may enumerate negated characters if the set is negated. private static bool CanEasilyEnumerateSetContents(string set) => diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 3b8b25539c2bef..12afbb48f663d5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2406,9 +2406,15 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil int i = childIndex; for ( ; i < exclusiveChildBound; i++) { - // We focus on only Ones and Sets. A sequence of Ones should have already been turned into a Multi, - // so we don't bother looking for Oneloop/etc. RegexNode child = Child(i); + if ((child.Options & RegexOptions.IgnoreCase) != 0) + { + // TODO https://github.com/dotnet/runtime/issues/61048: Remove this block once fixed. + // We don't want any nodes that are still IgnoreCase, as they'd no longer be IgnoreCase if + // they were applicable to this optimization. + break; + } + if (child.Kind is RegexNodeKind.One) { // We only want to include ASCII characters, and only if they don't participate in case conversion @@ -2419,8 +2425,21 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil { break; } + vsb.Append(child.Ch); } + else if (child.Kind is RegexNodeKind.Multi) + { + // As with RegexNodeKind.One, the string needs to be composed solely of ASCII characters that + // don't participate in case conversion. + if (!RegexCharClass.IsAscii(child.Str.AsSpan()) || + RegexCharClass.ParticipatesInCaseConversion(child.Str.AsSpan())) + { + break; + } + + vsb.Append(child.Str); + } else if (child.Kind is RegexNodeKind.Set || (child.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic && child.M == child.N)) {