From 8fa1133921c3a5cbc6bbb830ec64feab27839f90 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 5 Apr 2022 22:41:11 -0400 Subject: [PATCH] Improve handling of more common Regex sets - Today for a set like [\r\n], we'll emit a comparison that compares the char to each of '\r' and '\n', but for a set like [^\r\n], we end up falling back to emitting a lookup table. With this PR, we simply use the existing support for the non-negating case, just negating the result. - Today for a set like [\p{IsGreek}\p{IsGreekExtended}] that ends up being two ranges, we'll fall back to our lookup table. With this PR, we'll emit it as two range checks. - Today for a set like [A-Za-z], we'll fall back to our lookup table. As a special case of two-range support, with this PR we'll now recognize that these ranges are just one bit flip away from each other, and we'll employ the normal ASCII casing to do a single range comparison against the input or'd with a mask. - Today as a fallback, we employ a lookup table stored in a string; this requires a bounds check, dereferencing the string object, doing the math to find the right index, doing the math to find the right bit, etc. With this PR, for sets composed only of ranges where the exclusiveMax - inclusiveMin <= 64, we'll now emit it as a lookup into a ulong that's done in a branchless fashion and is much faster. - It appears to be relatively common for folks to use [\d\D], [\w\W], or [\s\S] as a simple way of saying "match anything"; RegexOptions.Singleline changes '.' to mean this as well. We already have special handling for '.' with Singleline as "AnyClass"... this just normalizes those other common representations into the same shape so that everything else recognizes them accordingly. - Today when we see an AnyClass, we emit a nonsense comparison that always results in true (or false for negations); that's because, for a while, the expression given to the matching routine may have had side effects. 
There are no longer side effects, though, so it's ok to just emit "true" or "false" directly and make the operation cheaper. - For every optimization we have in MatchCharacterClass, we should always be able to handle negation trivially. - Handle character classes composed of multiple UnicodeCategories. This helps with composed categories, like \p{N}. - Fix hard-coded char class strings for \W and \S. There are multiple ways to invert a RegexCharClass string: you can invert the whole string by just setting the invert flag, or you can invert all the individual components, e.g. if the string is composed of only categories, invert each category. The hardcoded string the parser uses when you write \W simply sets the negated bit, but this causes problems if \W is used as [\W], because then the individual components are added into a larger set that doesn't have negation set. And that means \W and [\W] result in different strings, which means any place we special-case the string for \W, we don't recognize [\W]. The same applies to \S. This commit changes the hardcoded string for \W and \S to use the more canonical form. Also, the implementation generally uses "set" and "class" interchangeably, but when specifying the ECMA-related strings, it uses "set" to actually mean "ranges", which is very confusing. I've changed them. 
--- .../gen/RegexGenerator.Emitter.cs | 159 +++++++--- .../Text/RegularExpressions/RegexCharClass.cs | 272 +++++++++++++----- .../Text/RegularExpressions/RegexCompiler.cs | 249 +++++++++++++--- .../Text/RegularExpressions/RegexNode.cs | 14 + .../FunctionalTests/RegexCharacterSetTests.cs | 16 +- 5 files changed, 545 insertions(+), 165 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index c65e5187a5b210..0e9ac4f2f3031a 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -340,7 +340,7 @@ private static (bool NeedsTryFind, bool NeedsTryMatch) EmitScan(IndentedTextWrit // empty, it's helpful as a learning exposition tool. writer.WriteLine("// The pattern never matches anything."); } - else if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set && (root.Options & RegexOptions.IgnoreCase) == 0) + else if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set) { // If the whole expression is just one or more characters, we can rely on the FindOptimizations spitting out // an IndexOf that will find the exact sequence or not, and we don't need to do additional checking beyond that. @@ -3776,14 +3776,16 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s // but that call is relatively expensive. Before we fall back to it, we try to optimize // some common cases for which we can do much better, such as known character classes // for which we can call a dedicated method, or a fast-path for ASCII using a lookup table. 
+ // In some cases, multiple optimizations are possible for a given character class: the checks + // in this method are generally ordered from fastest / simplest to slowest / most complex so + // that we get the best optimization for a given char class. // First, see if the char class is a built-in one for which there's a better function // we can just call directly. switch (charClass) { case RegexCharClass.AnyClass: - // ideally this could just be "return true;", but we need to evaluate the expression for its side effects - return $"({chExpr} {(negate ? "<" : ">=")} 0)"; // a char is unsigned and thus won't ever be negative + return negate ? "false" : "true"; // This assumes chExpr never has side effects. case RegexCharClass.DigitClass: case RegexCharClass.NotDigitClass: @@ -3811,60 +3813,127 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s $"(((uint){chExpr}) - {Literal(lowInclusive)} {(negate ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))"; } - // Next if the character class contains nothing but a single Unicode category, we can calle char.GetUnicodeCategory and + // Next, if the character class contains nothing but Unicode categories, we can call char.GetUnicodeCategory and // compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus - // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass. - if (RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated)) + // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass, + // but without the optimizations the C# compiler will provide for switches. 
+ Span categories = stackalloc UnicodeCategory[30]; // number of UnicodeCategory values (though it's unheard of to have a set with all of them) + if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negated)) { + // TODO https://github.com/dotnet/roslyn/issues/58246: Use pattern matching instead of switch once C# code gen quality improves. negate ^= negated; - return $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} UnicodeCategory.{category})"; + return numCategories == 1 ? + $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} UnicodeCategory.{categories[0]})" : + $"(char.GetUnicodeCategory({chExpr}) switch {{ {string.Join(" or ", categories.Slice(0, numCategories).ToArray().Select(c => $"UnicodeCategory.{c}"))} => {(negate ? "false" : "true")}, _ => {(negate ? "true" : "false")} }})"; } // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), // it may be cheaper and smaller to compare against each than it is to use a lookup table. We can also special-case // the very common case with case insensitivity of two characters next to each other being the upper and lowercase // ASCII variants of each other, in which case we can use bit manipulation to avoid a comparison. - if (!RegexCharClass.IsNegated(charClass)) + Span setChars = stackalloc char[3]; + int mask; + switch (RegexCharClass.GetSetChars(charClass, setChars)) { - Span setChars = stackalloc char[3]; - int mask; - switch (RegexCharClass.GetSetChars(charClass, setChars)) + case 2: + negate ^= RegexCharClass.IsNegated(charClass); + if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) + { + return $"(({chExpr} | 0x{mask:X}) {(negate ? "!=" : "==")} {Literal((char)(setChars[1] | mask))})"; + } + additionalDeclarations.Add("char ch;"); + return negate ? 
+ $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}))" : + $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; + + case 3: + negate ^= RegexCharClass.IsNegated(charClass); + additionalDeclarations.Add("char ch;"); + return (negate, RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) switch + { + (false, false) => $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))", + (true, false) => $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))", + (false, true) => $"((((ch = {chExpr}) | 0x{mask:X}) == {Literal((char)(setChars[1] | mask))}) | (ch == {Literal(setChars[2])}))", + (true, true) => $"((((ch = {chExpr}) | 0x{mask:X}) != {Literal((char)(setChars[1] | mask))}) & (ch != {Literal(setChars[2])}))", + }; + } + + // Next, handle simple sets of two ASCII letter ranges that are cased versions of each other, e.g. [A-Za-z]. + // This can be implemented as if it were a single range, with an additional bitwise operation. + if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) rangeLower, out (char LowInclusive, char HighInclusive) rangeUpper) && + RegexCharClass.IsAsciiLetter(rangeUpper.LowInclusive) && + RegexCharClass.IsAsciiLetter(rangeUpper.HighInclusive) && + (rangeLower.LowInclusive | 0x20) == rangeUpper.LowInclusive && + (rangeLower.HighInclusive | 0x20) == rangeUpper.HighInclusive) + { + Debug.Assert(rangeLower.LowInclusive != rangeUpper.LowInclusive); + negate ^= RegexCharClass.IsNegated(charClass); + return $"((uint)(({chExpr} | 0x20) - {Literal(rangeUpper.LowInclusive)}) {(negate ? ">" : "<=")} (uint)({Literal(rangeUpper.HighInclusive)} - {Literal(rangeUpper.LowInclusive)}))"; + } + + // Analyze the character set more to determine what code to generate. 
+ RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass); + + // Next, handle sets where the high - low + 1 range is <= 64. In that case, we can emit + // a branchless lookup in a ulong that does not rely on loading any objects (e.g. the string-based + // lookup we use later). This nicely handles common sets like [0-9A-Fa-f], [0-9a-f], [A-Za-z], etc. + if (analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges - analysis.LowerBoundInclusiveIfOnlyRanges) <= 64) + { + additionalDeclarations.Add("ulong charMinusLow;"); + + // Create the 64-bit value with 1s at indices corresponding to every character in the set, + // where the bit is computed to be the char value minus the lower bound starting from + // most significant bit downwards. + bool negatedClass = RegexCharClass.IsNegated(charClass); + ulong bitmap = 0; + for (int i = analysis.LowerBoundInclusiveIfOnlyRanges; i < analysis.UpperBoundExclusiveIfOnlyRanges; i++) { - case 2: - if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) - { - return $"(({chExpr} | 0x{mask:X}) {(negate ? "!=" : "==")} {Literal((char)(setChars[1] | mask))})"; - } - additionalDeclarations.Add("char ch;"); - return negate ? 
- $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}))" : - $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; - - case 3: - additionalDeclarations.Add("char ch;"); - return (negate, RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) switch - { - (false, false) => $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))", - (true, false) => $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))", - (false, true) => $"((((ch = {chExpr}) | 0x{mask:X}) == {Literal((char)(setChars[1] | mask))}) | (ch == {Literal(setChars[2])}))", - (true, true) => $"((((ch = {chExpr}) | 0x{mask:X}) != {Literal((char)(setChars[1] | mask))}) & (ch != {Literal(setChars[2])}))", - }; + if (RegexCharClass.CharInClass((char)i, charClass) ^ negatedClass) + { + bitmap |= (1ul << (63 - (i - analysis.LowerBoundInclusiveIfOnlyRanges))); + } } + + // To determine whether a character is in the set, we subtract the lowest char (casting to + // uint to account for any smaller values); this subtraction happens before the result is + // zero-extended to ulong, meaning that `charMinusLow` will always have upper 32 bits equal to 0. + // We then left shift the constant with this offset, and apply a bitmask that has the highest + // bit set (the sign bit) if and only if `chExpr` is in the [low, low + 64) range. + // Then we only need to check whether this final result is less than 0: this will only be + // the case if both `charMinusLow` was in fact the index of a set bit in the constant, and also + // `chExpr` was in the allowed range (this ensures that false positive bit shifts are ignored). + negate ^= negatedClass; + return $"((long)((0x{bitmap:X}UL << (int)(charMinusLow = (uint){chExpr} - {Literal((char)analysis.LowerBoundInclusiveIfOnlyRanges)})) & (charMinusLow - 64)) {(negate ? 
">=" : "<")} 0)"; } // All options after this point require a ch local. additionalDeclarations.Add("char ch;"); - // Analyze the character set more to determine what code to generate. - RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass); + // Next, handle simple sets of two ranges, e.g. [\p{IsGreek}\p{IsGreekExtended}]. + if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) range0, out (char LowInclusive, char HighInclusive) range1)) + { + negate ^= RegexCharClass.IsNegated(charClass); + + string range0Clause = range0.LowInclusive == range0.HighInclusive ? + $"((ch = {chExpr}) {(negate ? "!=" : "==")} {Literal(range0.LowInclusive)})" : + $"((uint)((ch = {chExpr}) - {Literal(range0.LowInclusive)}) {(negate ? ">" : "<=")} (uint)({Literal(range0.HighInclusive)} - {Literal(range0.LowInclusive)}))"; + + string range1Clause = range1.LowInclusive == range1.HighInclusive ? + $"(ch {(negate ? "!=" : "==")} {Literal(range1.LowInclusive)})" : + $"((uint)(ch - {Literal(range1.LowInclusive)}) {(negate ? ">" : "<=")} (uint)({Literal(range1.HighInclusive)} - {Literal(range1.LowInclusive)}))"; + + return negate ? + $"({range0Clause} & {range1Clause})" : + $"({range0Clause} | {range1Clause})"; + } if (analysis.ContainsNoAscii) { // We determined that the character class contains only non-ASCII, - // for example if the class were [\p{IsGreek}\p{IsGreekExtended}], which is - // the same as [\u0370-\u03FF\u1F00-1FFF]. (In the future, we could possibly - // extend the analysis to produce a known lower-bound and compare against - // that rather than always using 128 as the pivot point.) + // for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000]. + // (In the future, we could possibly extend the analysis to produce a known + // lower-bound and compare against that rather than always using 128 as the + // pivot point.) return negate ? 
$"((ch = {chExpr}) < 128 || !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : $"((ch = {chExpr}) >= 128 && RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; @@ -3912,19 +3981,18 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s if (analysis.ContainsOnlyAscii) { - // We know that all inputs that could match are ASCII, for example if the - // character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we - // can just fail the comparison. + // If all inputs that could match are ASCII, we only need the lookup table, guarded + // by a check for the upper bound (which serves both to limit for what characters + // we need to access the lookup table and to bounds check the lookup table access). return negate ? - $"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : - $"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; + $"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfOnlyRanges)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : + $"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfOnlyRanges)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; } if (analysis.AllNonAsciiContained) { - // We know that all non-ASCII inputs match, for example if the character - // class were [^\r\n], so since we just determined the ch to be >= 128, we can just - // give back success. + // If every non-ASCII value is considered a match, we can immediately succeed for any + // non-ASCII inputs, and access the lookup table for the rest. return negate ? 
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; @@ -3932,7 +4000,8 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s // We know that the whole class wasn't ASCII, and we don't know anything about the non-ASCII // characters other than that some might be included, for example if the character class - // were [\w\d], so since ch >= 128, we need to fall back to calling CharInClass. + // were [\w\d], so if ch >= 128, we need to fall back to calling CharInClass, otherwise use + // the lookup table. return negate ? $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 219d9823528651..3e341c7655e9b1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -23,7 +23,10 @@ namespace System.Text.RegularExpressions // and see where the character should go. Based on whether the ending index is odd or even, // we know if the character is in the set. // m+1...n The categories. This is a list of UnicodeCategory enum values which describe categories - // included in this class. + // included in this class. 
These can either be individual values (either UnicodeCategory - 1 + // for inclusive values, or -1 - UnicodeCategory for exclusive values), or a "group", which + // is a contiguous sequence of such values surrounded by \0 values; all values in the group + // have the same positive/negative orientation. /// Provides the "set of Unicode chars" functionality used by the regexp engine. internal sealed partial class RegexCharClass @@ -34,43 +37,49 @@ internal sealed partial class RegexCharClass internal const int CategoryLengthIndex = 2; internal const int SetStartIndex = 3; // must be odd for subsequent logic to work - private const string NullCharString = "\0"; - private const char NullChar = '\0'; internal const char LastChar = '\uFFFF'; internal const short SpaceConst = 100; private const short NotSpaceConst = -100; private const string InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__"; - private const string Space = "\x64"; - private const string NotSpace = "\uFF9C"; - private const string Word = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; - private const string NotWord = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; + private const string SpaceCategories = "\x64"; + private const string NotSpaceCategories = "\uFF9C"; + private const string WordCategories = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; + private const string NotWordCategories = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; internal const string SpaceClass = "\u0000\u0000\u0001\u0064"; - internal const string NotSpaceClass = "\u0001\u0000\u0001\u0064"; + internal const string NotSpaceClass = "\u0000\u0000\u0001\uFF9C"; internal const string WordClass = "\u0000\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; - internal const string NotWordClass = "\u0001\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; + internal const string NotWordClass = 
"\u0000\u0000\u000A\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; internal const string DigitClass = "\u0000\u0000\u0001\u0009"; internal const string NotDigitClass = "\u0000\u0000\u0001\uFFF7"; - private const string ECMASpaceSet = "\u0009\u000E\u0020\u0021"; - private const string NotECMASpaceSet = "\0\u0009\u000E\u0020\u0021"; - private const string ECMAWordSet = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; - private const string NotECMAWordSet = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; - private const string ECMADigitSet = "\u0030\u003A"; - private const string NotECMADigitSet = "\0\u0030\u003A"; + private const string ECMASpaceRanges = "\u0009\u000E\u0020\u0021"; + private const string NotECMASpaceRanges = "\0\u0009\u000E\u0020\u0021"; + private const string ECMAWordRanges = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; + private const string NotECMAWordRanges = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; + private const string ECMADigitRanges = "\u0030\u003A"; + private const string NotECMADigitRanges = "\0\u0030\u003A"; - internal const string ECMASpaceClass = "\x00\x04\x00" + ECMASpaceSet; - internal const string NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceSet; - internal const string ECMAWordClass = "\x00\x0A\x00" + ECMAWordSet; - internal const string NotECMAWordClass = "\x01\x0A\x00" + ECMAWordSet; - internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitSet; - internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet; + internal const string ECMASpaceClass = "\x00\x04\x00" + ECMASpaceRanges; + internal const string NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceRanges; + internal const string ECMAWordClass = "\x00\x0A\x00" + ECMAWordRanges; + internal const string NotECMAWordClass = "\x01\x0A\x00" + ECMAWordRanges; + internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitRanges; + internal const string NotECMADigitClass 
= "\x01\x02\x00" + ECMADigitRanges; internal const string AnyClass = "\x00\x01\x00\x00"; private const string EmptyClass = "\x00\x00\x00"; + // Sets regularly used as a canonical way to express the equivalent of '.' with Singleline when Singleline isn't in use. + internal const string WordNotWordClass = "\u0000\u0000\u0014\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; + internal const string NotWordWordClass = "\u0000\u0000\u0014\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; + internal const string DigitNotDigitClass = "\u0000\u0000\u0002\u0009\uFFF7"; + internal const string NotDigitDigitClass = "\u0000\u0000\u0002\uFFF7\u0009"; + internal const string SpaceNotSpaceClass = "\u0000\u0000\u0002\u0064\uFF9C"; + internal const string NotSpaceSpaceClass = "\u0000\u0000\u0002\uFF9C\u0064"; + // UnicodeCategory is zero based, so we add one to each value and subtract it off later private const int DefinedCategoriesCapacity = 38; private static readonly Dictionary s_definedCategories = new Dictionary(DefinedCategoriesCapacity) @@ -93,7 +102,7 @@ internal sealed partial class RegexCharClass // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter} // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!! - { "__InternalRegexIgnoreCase__", "\u0000\u0002\u0003\u0001\u0000" }, + { InternalRegexIgnoreCase, "\u0000\u0002\u0003\u0001\u0000" }, // Marks { "Mc", "\u0007" }, // UnicodeCategory.SpacingCombiningMark + 1 @@ -347,9 +356,9 @@ private StringBuilder EnsureCategories() => _rangelist ??= new List<(char First, char Last)>(6); /// - /// Adds a set (specified by its string representation) to the class. + /// Adds ranges (specified by their range string representation) to the class. 
/// - private void AddSet(ReadOnlySpan set) + private void AddRanges(ReadOnlySpan set) { if (set.Length == 0) { @@ -410,7 +419,7 @@ public void AddCategoryFromName(string categoryName, bool invert, bool caseInsen } else { - AddSet(SetFromProperty(categoryName, invert, pattern, currentPos)); + AddRanges(RangesFromProperty(categoryName, invert, pattern, currentPos)); } } @@ -469,11 +478,11 @@ public void AddWord(bool ecma, bool negate) { if (ecma) { - AddSet((negate ? NotECMAWordSet : ECMAWordSet).AsSpan()); + AddRanges((negate ? NotECMAWordRanges : ECMAWordRanges).AsSpan()); } else { - AddCategory(negate ? NotWord : Word); + AddCategory(negate ? NotWordCategories : WordCategories); } } @@ -481,11 +490,11 @@ public void AddSpace(bool ecma, bool negate) { if (ecma) { - AddSet((negate ? NotECMASpaceSet : ECMASpaceSet).AsSpan()); + AddRanges((negate ? NotECMASpaceRanges : ECMASpaceRanges).AsSpan()); } else { - AddCategory(negate ? NotSpace : Space); + AddCategory(negate ? NotSpaceCategories : SpaceCategories); } } @@ -493,7 +502,7 @@ public void AddDigit(bool ecma, bool negate, string pattern, int currentPos) { if (ecma) { - AddSet((negate ? NotECMADigitSet : ECMADigitSet).AsSpan()); + AddRanges((negate ? NotECMADigitRanges : ECMADigitRanges).AsSpan()); } else { @@ -581,42 +590,118 @@ public static bool IsSingletonInverse(string set) => !IsSubtraction(set) && (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]); - /// Gets whether the set contains nothing other than a single UnicodeCategory (it may be negated). - /// The set to examine. - /// The single category if there was one. - /// true if the single category is a not match. - /// true if a single category could be obtained; otherwise, false. 
- public static bool TryGetSingleUnicodeCategory(string set, out UnicodeCategory category, out bool negated) + /// + /// Gets the categories from a set if the set is only categories (no ranges, no subtraction), + /// they all share the same negation status (not doing so is rare), and they all fit in the destination span. + /// + /// The character class to examine. + /// The destination span into which the categories should be written. + /// The number of categories written to . + /// false if the categories written to represent inclusions; true if they represent exclusions. + /// true if the categories could be retrieved; otherwise, false. + public static bool TryGetOnlyCategories(string set, Span categories, out int numCategories, out bool negated) { - if (set[CategoryLengthIndex] == 1 && - set[SetLengthIndex] == 0 && - !IsSubtraction(set)) + negated = false; + numCategories = 0; + bool sawFirstCategory = false; + + // Require that the character class has no ranges, has no subtraction, and has categories. + int categoryLength = set[CategoryLengthIndex]; + if (categoryLength == 0 || set[SetLengthIndex] != 0 || IsSubtraction(set)) { - short c = (short)set[SetStartIndex]; + return false; + } + // Loop through all categories, storing them into the categories span. + int categoryEnd = SetStartIndex + set[CategoryLengthIndex]; + for (int pos = SetStartIndex; pos < categoryEnd; pos++) + { + // Get the next category value. + short c = (short)set[pos]; if (c > 0) { - if (c != SpaceConst) + // It's a positive (inclusive) value. Make sure all previous categories seen are also positive. + // Also make sure it's not the fake space category, which consumers don't handle as it's + // not a real UnicodeCategory. 
+ if ((sawFirstCategory && negated) || + c == SpaceConst || + numCategories == categories.Length) { - category = (UnicodeCategory)(c - 1); - negated = IsNegated(set); - return true; + return false; } + + sawFirstCategory = true; + categories[numCategories++] = (UnicodeCategory)(c - 1); } else if (c < 0) { - if (c != NotSpaceConst) + // It's a negative (exclusive) value. Make sure all previous categories seen are also negative. + // Also make sure it's not the fake non-space category, which consumers don't handle as it's + // not a real UnicodeCategory. + if ((sawFirstCategory && !negated) || + c == NotSpaceConst || + numCategories == categories.Length) { - category = (UnicodeCategory)(-1 - c); - negated = !IsNegated(set); - return true; + return false; + } + + sawFirstCategory = true; + negated = true; + categories[numCategories++] = (UnicodeCategory)(-1 - c); + } + else // c == 0 + { + // It's the start of a group. Every value in the group needs to have the same orientation. + // We stop when we hit the next 0. + c = (short)set[++pos]; + Debug.Assert(c != 0); + if (c > 0) + { + if (sawFirstCategory && negated) + { + return false; + } + sawFirstCategory = true; + + do + { + if (numCategories == categories.Length) + { + return false; + } + + categories[numCategories++] = (UnicodeCategory)(c - 1); + c = (short)set[++pos]; + } + while (c != 0); + } + else + { + if (sawFirstCategory && !negated) + { + return false; + } + negated = true; + sawFirstCategory = true; + + do + { + if (numCategories == categories.Length) + { + return false; + } + + categories[numCategories++] = (UnicodeCategory)(-1 - c); + c = (short)set[++pos]; + } + while (c != 0); } } } - category = default; - negated = false; - return false; + // Factor in whether the entire character class is itself negated. + negated ^= IsNegated(set); + return true; } /// Attempts to get a single range stored in the set. 
@@ -652,6 +737,32 @@ public static bool TryGetSingleRange(string set, out char lowInclusive, out char return false; } + /// Attempts to get two ranges stored in the set. The set may be negated. + /// The set. + /// The first result range. + /// The second result range. + /// true if the set contained exactly two ranges; otherwise, false. + public static bool TryGetDoubleRange( + string set, + out (char LowInclusive, char HighInclusive) range0, + out (char LowInclusive, char HighInclusive) range1) + { + if (set[CategoryLengthIndex] == 0 && // must not have any categories + set.Length == SetStartIndex + set[SetLengthIndex]) // and no subtraction + { + int setLength = set[SetLengthIndex]; + if (setLength is 3 or 4) + { + range0 = (set[SetStartIndex], (char)(set[SetStartIndex + 1] - 1)); + range1 = (set[SetStartIndex + 2], setLength == 3 ? LastChar : (char)(set[SetStartIndex + 3] - 1)); + return true; + } + } + + range0 = range1 = ('\0', '\0'); + return false; + } + /// Gets all of the characters in the specified set, storing them into the provided span. /// The character class. /// The span into which the chars should be stored. @@ -853,6 +964,10 @@ public static bool IsAscii(ReadOnlySpan s) // TODO https://github.com/dotn return true; } + /// Gets whether the specified character is an ASCII letter. + public static bool IsAsciiLetter(char c) => // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available + (uint)((c | 0x20) - 'a') <= 'z' - 'a'; + /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents. /// This may enumerate negated characters if the set is negated. private static bool CanEasilyEnumerateSetContents(string set) => @@ -865,16 +980,26 @@ private static bool CanEasilyEnumerateSetContents(string set) => /// Provides results from <see cref="Analyze"/>. internal struct CharClassAnalysisResults { + /// true if the set contains only ranges; false if it contains Unicode categories and/or subtraction. 
+ public bool OnlyRanges; /// true if we know for sure that the set contains only ASCII values; otherwise, false. + /// This can only be true if <see cref="OnlyRanges"/> is true. public bool ContainsOnlyAscii; /// true if we know for sure that the set doesn't contain any ASCII values; otherwise, false. + /// This can only be true if <see cref="OnlyRanges"/> is true. public bool ContainsNoAscii; /// true if we know for sure that all ASCII values are in the set; otherwise, false. + /// This can only be true if <see cref="OnlyRanges"/> is true. public bool AllAsciiContained; /// true if we know for sure that all non-ASCII values are in the set; otherwise, false. + /// This can only be true if <see cref="OnlyRanges"/> is true. public bool AllNonAsciiContained; - /// The exclusive upper bound. Only valid if <see cref="ContainsOnlyAscii"/> is true. - public int UpperBoundExclusiveIfContainsOnlyAscii; + /// The inclusive lower bound. + /// This is only valid if <see cref="OnlyRanges"/> is true. + public int LowerBoundInclusiveIfOnlyRanges; + /// The exclusive upper bound. + /// This is only valid if <see cref="OnlyRanges"/> is true. + public int UpperBoundExclusiveIfOnlyRanges; } /// Analyzes the set to determine some basic properties that can be used to optimize usage. @@ -901,10 +1026,13 @@ internal static CharClassAnalysisResults Analyze(string set) // everything ASCII is included. return new CharClassAnalysisResults { + OnlyRanges = true, AllNonAsciiContained = set[set.Length - 1] < 128, AllAsciiContained = set[SetStartIndex] >= 128, ContainsNoAscii = false, - ContainsOnlyAscii = false + ContainsOnlyAscii = false, + LowerBoundInclusiveIfOnlyRanges = set[SetStartIndex], + UpperBoundExclusiveIfOnlyRanges = set[set.Length - 1], }; } @@ -912,11 +1040,13 @@ internal static CharClassAnalysisResults Analyze(string set) // Similarly if the lower bound is non-ASCII, that means no ASCII is in the class. 
return new CharClassAnalysisResults { + OnlyRanges = true, AllNonAsciiContained = false, AllAsciiContained = false, ContainsOnlyAscii = set[set.Length - 1] <= 128, ContainsNoAscii = set[SetStartIndex] >= 128, - UpperBoundExclusiveIfContainsOnlyAscii = set[set.Length - 1], + LowerBoundInclusiveIfOnlyRanges = set[SetStartIndex], + UpperBoundExclusiveIfOnlyRanges = set[set.Length - 1], }; } @@ -1220,34 +1350,30 @@ private static bool CharInCategoryGroup(UnicodeCategory chcategory, string categ { int pos = i + 1; int curcat = (short)category[pos]; - bool result; if (curcat > 0) { // positive case - the character must be in ANY of the categories in the group result = false; - for (; curcat != 0; curcat = (short)category[pos]) + do { - pos++; - if (!result && chcategory == (UnicodeCategory)(curcat - 1)) - { - result = true; - } + result |= chcategory == (UnicodeCategory)(curcat - 1); + curcat = (short)category[++pos]; } + while (curcat != 0); } else { // negative case - the character must be in NONE of the categories in the group + Debug.Assert(curcat < 0); result = true; - for (; curcat != 0; curcat = (short)category[pos]) + do { - pos++; - if (result && chcategory == (UnicodeCategory)(-1 - curcat)) - { - result = false; - } + result &= chcategory != (UnicodeCategory)(-1 - curcat); + curcat = (short)category[++pos]; } + while (curcat != 0); } i = pos; @@ -1559,7 +1685,7 @@ _subtractor is null && } } - private static ReadOnlySpan SetFromProperty(string capname, bool invert, string pattern, int currentPos) + private static ReadOnlySpan RangesFromProperty(string capname, bool invert, string pattern, int currentPos) { int min = 0; int max = s_propTable.Length; @@ -1581,8 +1707,8 @@ private static ReadOnlySpan SetFromProperty(string capname, bool invert, s Debug.Assert(!string.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table"); return !invert ? set.AsSpan() : - set[0] == NullChar ? 
set.AsSpan(1) : - (NullCharString + set).AsSpan(); + set[0] == '\0' ? set.AsSpan(1) : + ("\0" + set).AsSpan(); } } @@ -1671,11 +1797,11 @@ public static string DescribeSet(string set) if (!found) { - if (group.Equals(Word)) + if (group.Equals(WordCategories)) { desc.Append("\\w"); } - else if (group.Equals(NotWord)) + else if (group.Equals(NotWordCategories)) { desc.Append("\\W"); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index e6d1e47fb6b9f6..2fa72def6f27fe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -4508,7 +4508,7 @@ protected void EmitScan(RegexOptions options, DynamicMethod tryFindNextStartingP RegexNode root = _regexTree!.Root.Child(0); Label returnLabel = DefineLabel(); - if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set && (root.Options & RegexOptions.IgnoreCase) == 0) + if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set) { // If the whole expression is just one or more characters, we can rely on the FindOptimizations spitting out // an IndexOf that will find the exact sequence or not, and we don't need to do additional checking beyond that. @@ -4634,6 +4634,9 @@ private void EmitMatchCharacterClass(string charClass) // but that call is relatively expensive. Before we fall back to it, we try to optimize // some common cases for which we can do much better, such as known character classes // for which we can call a dedicated method, or a fast-path for ASCII using a lookup table. 
+ // In some cases, multiple optimizations are possible for a given character class: the checks + // in this method are generally ordered from fastest / simplest to slowest / most complex so + // that we get the best optimization for a given char class. // First, see if the char class is a built-in one for which there's a better function // we can just call directly. @@ -4710,14 +4713,17 @@ private void EmitMatchCharacterClass(string charClass) return; } - // Next if the character class contains nothing but a single Unicode category, we can calle char.GetUnicodeCategory and + // Next, if the character class contains nothing but Unicode categories, we can call char.GetUnicodeCategory and // compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass. - if (RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated)) + // Unlike the source generator, however, we only handle the case of a single UnicodeCategory: the source generator is able + // to rely on C# compiler optimizations to handle dealing with multiple values efficiently. + Span categories = stackalloc UnicodeCategory[1]; // handle the case of one and only one category + if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negated)) { // char.GetUnicodeCategory(ch) == category Call(s_charGetUnicodeInfo); - Ldc((int)category); + Ldc((int)categories[0]); Ceq(); if (negated) { @@ -4728,59 +4734,212 @@ private void EmitMatchCharacterClass(string charClass) return; } - // All checks after this point require reading the input character multiple times, + // Checks after this point require reading the input character multiple times, // so we store it into a temporary local. 
using RentedLocalBuilder tempLocal = RentInt32Local(); Stloc(tempLocal); // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), // it's cheaper and smaller to compare against each than it is to use a lookup table. - if (!RegexCharClass.IsNegated(charClass)) + Span setChars = stackalloc char[3]; + int numChars = RegexCharClass.GetSetChars(charClass, setChars); + if (numChars is 2 or 3) { - Span setChars = stackalloc char[3]; - int numChars = RegexCharClass.GetSetChars(charClass, setChars); - if (numChars is 2 or 3) + if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out int mask)) // special-case common case of an upper and lowercase ASCII letter combination { - if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out int mask)) // special-case common case of an upper and lowercase ASCII letter combination - { - // ((ch | mask) == setChars[1]) - Ldloc(tempLocal); - Ldc(mask); - Or(); - Ldc(setChars[1] | mask); - Ceq(); - } - else - { - // (ch == setChars[0]) | (ch == setChars[1]) - Ldloc(tempLocal); - Ldc(setChars[0]); - Ceq(); - Ldloc(tempLocal); - Ldc(setChars[1]); - Ceq(); - Or(); - } + // ((ch | mask) == setChars[1]) + Ldloc(tempLocal); + Ldc(mask); + Or(); + Ldc(setChars[1] | mask); + Ceq(); + } + else + { + // (ch == setChars[0]) | (ch == setChars[1]) + Ldloc(tempLocal); + Ldc(setChars[0]); + Ceq(); + Ldloc(tempLocal); + Ldc(setChars[1]); + Ceq(); + Or(); + } - // | (ch == setChars[2]) - if (numChars == 3) - { - Ldloc(tempLocal); - Ldc(setChars[2]); - Ceq(); - Or(); - } + // | (ch == setChars[2]) + if (numChars == 3) + { + Ldloc(tempLocal); + Ldc(setChars[2]); + Ceq(); + Or(); + } - return; + if (RegexCharClass.IsNegated(charClass)) + { + Ldc(0); + Ceq(); } + return; } - using RentedLocalBuilder resultLocal = RentInt32Local(); + // Next, handle simple sets of two ASCII letter ranges that are cased versions of each other, e.g. [A-Za-z]. 
+ // This can be implemented as if it were a single range, with an additional bitwise operation. + if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) rangeLower, out (char LowInclusive, char HighInclusive) rangeUpper) && + RegexCharClass.IsAsciiLetter(rangeUpper.LowInclusive) && + RegexCharClass.IsAsciiLetter(rangeUpper.HighInclusive) && + (rangeLower.LowInclusive | 0x20) == rangeUpper.LowInclusive && + (rangeLower.HighInclusive | 0x20) == rangeUpper.HighInclusive) + { + Debug.Assert(rangeLower.LowInclusive != rangeUpper.LowInclusive); + bool negate = RegexCharClass.IsNegated(charClass); + + // (uint)((ch | 0x20) - lowInclusive) < highInclusive - lowInclusive + 1 + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(rangeUpper.LowInclusive); + Sub(); + Ldc(rangeUpper.HighInclusive - rangeUpper.LowInclusive + 1); + CltUn(); + if (negate) + { + Ldc(0); + Ceq(); + } + return; + } // Analyze the character set more to determine what code to generate. RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass); - // Helper method that emits a call to RegexRunner.CharInClass(ch{.ToLowerInvariant()}, charClass) + // Next, handle sets where the high - low + 1 range is <= 64. In that case, we can emit + // a branchless lookup in a ulong that does not rely on loading any objects (e.g. the string-based + // lookup we use later). This nicely handles common sets like [0-9A-Fa-f], [0-9a-f], [A-Za-z], etc. + if (analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges - analysis.LowerBoundInclusiveIfOnlyRanges) <= 64) + { + // Create the 64-bit value with 1s at indices corresponding to every character in the set, + // where the bit is computed to be the char value minus the lower bound starting from + // most significant bit downwards. 
+ ulong bitmap = 0; + bool negatedClass = RegexCharClass.IsNegated(charClass); + for (int i = analysis.LowerBoundInclusiveIfOnlyRanges; i < analysis.UpperBoundExclusiveIfOnlyRanges; i++) + { + if (RegexCharClass.CharInClass((char)i, charClass) ^ negatedClass) + { + bitmap |= (1ul << (63 - (i - analysis.LowerBoundInclusiveIfOnlyRanges))); + } + } + + // To determine whether a character is in the set, we subtract the lowest char (casting to + // uint to account for any smaller values); this subtraction happens before the result is + // zero-extended to ulong, meaning that `charMinusLow` will always have upper 32 bits equal to 0. + // We then left shift the constant with this offset, and apply a bitmask that has the highest + // bit set (the sign bit) if and only if `chExpr` is in the [low, low + 64) range. + // Then we only need to check whether this final result is less than 0: this will only be + // the case if both `charMinusLow` was in fact the index of a set bit in the constant, and also + // `chExpr` was in the allowed range (this ensures that false positive bit shifts are ignored). + + // ulong charMinusLow = (uint)ch - lowInclusive; + LocalBuilder charMinusLow = _ilg!.DeclareLocal(typeof(ulong)); + Ldloc(tempLocal); + Ldc(analysis.LowerBoundInclusiveIfOnlyRanges); + Sub(); + _ilg!.Emit(OpCodes.Conv_U8); + Stloc(charMinusLow); + + // ulong shift = bitmap << (int)charMinusLow; + LdcI8((long)bitmap); + Ldloc(charMinusLow); + _ilg!.Emit(OpCodes.Conv_I4); + Ldc(63); + And(); + Shl(); + + // ulong mask = charMinusLow - 64; + Ldloc(charMinusLow); + Ldc(64); + _ilg!.Emit(OpCodes.Conv_I8); + Sub(); + + // (long)(shift & mask) < 0 // or >= for a negated character class + And(); + Ldc(0); + _ilg!.Emit(OpCodes.Conv_I8); + _ilg!.Emit(OpCodes.Clt); + if (negatedClass) + { + Ldc(0); + Ceq(); + } + + return; + } + + // Next, handle simple sets of two ranges, e.g. [\p{IsGreek}\p{IsGreekExtended}]. 
+ if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) range0, out (char LowInclusive, char HighInclusive) range1)) + { + bool negate = RegexCharClass.IsNegated(charClass); + + if (range0.LowInclusive == range0.HighInclusive) + { + // ch == lowInclusive + Ldloc(tempLocal); + Ldc(range0.LowInclusive); + Ceq(); + } + else + { + // (uint)(ch - lowInclusive) < (uint)(highInclusive - lowInclusive + 1) + Ldloc(tempLocal); + Ldc(range0.LowInclusive); + Sub(); + Ldc(range0.HighInclusive - range0.LowInclusive + 1); + CltUn(); + } + if (negate) + { + Ldc(0); + Ceq(); + } + + if (range1.LowInclusive == range1.HighInclusive) + { + // ch == lowInclusive + Ldloc(tempLocal); + Ldc(range1.LowInclusive); + Ceq(); + } + else + { + // (uint)(ch - lowInclusive) < (uint)(highInclusive - lowInclusive + 1) + Ldloc(tempLocal); + Ldc(range1.LowInclusive); + Sub(); + Ldc(range1.HighInclusive - range1.LowInclusive + 1); + CltUn(); + } + if (negate) + { + Ldc(0); + Ceq(); + } + + if (negate) + { + And(); + } + else + { + Or(); + } + + return; + } + + using RentedLocalBuilder resultLocal = RentInt32Local(); + + // Helper method that emits a call to RegexRunner.CharInClass(ch, charClass) void EmitCharInClass() { Ldloc(tempLocal); @@ -4795,10 +4954,10 @@ void EmitCharInClass() if (analysis.ContainsNoAscii) { // We determined that the character class contains only non-ASCII, - // for example if the class were [\p{IsGreek}\p{IsGreekExtended}], which is - // the same as [\u0370-\u03FF\u1F00-1FFF]. (In the future, we could possibly - // extend the analysis to produce a known lower-bound and compare against - // that rather than always using 128 as the pivot point.) + // for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000]. + // (In the future, we could possibly extend the analysis to produce a known + // lower-bound and compare against that rather than always using 128 as the + // pivot point.) 
// ch >= 128 && RegexRunner.CharInClass(ch, "...") Ldloc(tempLocal); @@ -4866,7 +5025,7 @@ void EmitCharInClass() // ch < 128 ? (bitVectorString[ch >> 4] & (1 << (ch & 0xF))) != 0 : Ldloc(tempLocal); - Ldc(analysis.ContainsOnlyAscii ? analysis.UpperBoundExclusiveIfContainsOnlyAscii : 128); + Ldc(analysis.ContainsOnlyAscii ? analysis.UpperBoundExclusiveIfOnlyRanges : 128); Bge(comparisonLabel); Ldstr(bitVectorString); Ldloc(tempLocal); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 132d7457d71123..82a9071a71d506 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -885,6 +885,20 @@ private RegexNode ReduceSet() RegexNodeKind.Notonelazy; } + // Normalize some well-known sets + switch (Str) + { + // Different ways of saying "match anything" + case RegexCharClass.WordNotWordClass: + case RegexCharClass.NotWordWordClass: + case RegexCharClass.DigitNotDigitClass: + case RegexCharClass.NotDigitDigitClass: + case RegexCharClass.SpaceNotSpaceClass: + case RegexCharClass.NotSpaceSpaceClass: + Str = RegexCharClass.AnyClass; + break; + } + return this; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs index 84f0e5f9d78919..9db13f6a0e37a3 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs @@ -19,6 +19,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() { yield return new object[] { engine, @"a", RegexOptions.IgnoreCase, new[] { 'a', 'A' } }; yield return 
new object[] { engine, @"ac", RegexOptions.None, new[] { 'a', 'c' } }; + yield return new object[] { engine, @"\u00E5\u00C5\u212B", RegexOptions.None, new[] { '\u00E5', '\u00C5', '\u212B' } }; yield return new object[] { engine, @"ace", RegexOptions.None, new[] { 'a', 'c', 'e' } }; yield return new object[] { engine, @"aceg", RegexOptions.None, new[] { 'a', 'c', 'e', 'g' } }; yield return new object[] { engine, @"aceg", RegexOptions.IgnoreCase, new[] { 'a', 'A', 'c', 'C', 'e', 'E', 'g', 'G' } }; @@ -37,6 +38,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() yield return new object[] { engine, @"a ", RegexOptions.None, new[] { 'a', ' ' } }; yield return new object[] { engine, @"a \t\r", RegexOptions.None, new[] { 'a', ' ', '\t', '\r' } }; yield return new object[] { engine, @"aeiou", RegexOptions.None, new[] { 'a', 'e', 'i', 'o', 'u' } }; + yield return new object[] { engine, @"\u0000aeiou\u00FF", RegexOptions.None, new[] { '\u0000', 'a', 'e', 'i', 'o', 'u', '\u00FF' } }; yield return new object[] { engine, @"a-a", RegexOptions.None, new[] { 'a' } }; yield return new object[] { engine, @"ab", RegexOptions.None, new[] { 'a', 'b' } }; yield return new object[] { engine, @"a-b", RegexOptions.None, new[] { 'a', 'b' } }; @@ -45,6 +47,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() yield return new object[] { engine, @"ACEGIKMOQSUWY", RegexOptions.None, new[] { 'A', 'C', 'E', 'G', 'I', 'K', 'M', 'O', 'Q', 'S', 'U', 'W', 'Y' } }; yield return new object[] { engine, @"abcAB", RegexOptions.None, new[] { 'A', 'B', 'a', 'b', 'c' } }; yield return new object[] { engine, @"a-c", RegexOptions.None, new[] { 'a', 'b', 'c' } }; + yield return new object[] { engine, @"a-fA-F", RegexOptions.None, new[] { 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F' } }; yield return new object[] { engine, @"a-fA-F0-9", RegexOptions.None, new[] { 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' } 
}; yield return new object[] { engine, @"X-b", RegexOptions.None, new[] { 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b' } }; yield return new object[] { engine, @"\u0083\u00DE-\u00E1", RegexOptions.None, new[] { '\u0083', '\u00DE', '\u00DF', '\u00E0', '\u00E1' } }; @@ -55,6 +58,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() yield return new object[] { engine, @"[a-z-[d-w-[m-o]]]", RegexOptions.None, new[] { 'a', 'b', 'c', 'm', 'n', 'n', 'o', 'x', 'y', 'z' } }; yield return new object[] { engine, @"\p{IsBasicLatin}-[\x00-\x7F]", RegexOptions.None, new char[0] }; yield return new object[] { engine, @"[0-9-[2468]]", RegexOptions.None, new[] { '0', '1', '3', '5', '7', '9' } }; + yield return new object[] { engine, @"[\u1000-\u1001\u3000-\u3002\u5000-\u5003]", RegexOptions.None, new[] { '\u1000', '\u1001', '\u3000', '\u3001', '\u3002', '\u5000', '\u5001', '\u5002', '\u5003' } }; } } @@ -69,8 +73,8 @@ public async Task SetInclusionsExpected(RegexEngine engine, string set, RegexOpt } else { - await ValidateSetAsync(engine, $"[{set}]", options, new HashSet(expectedIncluded), null); - await ValidateSetAsync(engine, $"[^{set}]", options, null, new HashSet(expectedIncluded)); + await ValidateSetAsync(engine, $"[{set}]", options, new HashSet(expectedIncluded), null, validateEveryChar: true); + await ValidateSetAsync(engine, $"[^{set}]", options, null, new HashSet(expectedIncluded), validateEveryChar: true); } } @@ -146,11 +150,19 @@ public async Task AllEmptySets(RegexEngine engine) await ValidateSetAsync(engine, @"[\u0000-\uFFFFa-z]", RegexOptions.None, null, set); await ValidateSetAsync(engine, @"[\u0000-\u1000\u1001-\u2002\u2003-\uFFFF]", RegexOptions.None, null, set); await ValidateSetAsync(engine, @"[\u0000-\uFFFE\u0001-\uFFFF]", RegexOptions.None, null, set, validateEveryChar: true); + foreach (string all in new[] { @"[\d\D]", @"[\D\d]", @"[\w\W]", @"[\W\w]", @"[\s\S]", @"[\S\s]", }) + { + await ValidateSetAsync(engine, all, RegexOptions.None, 
null, new HashSet(), validateEveryChar: true); + } await ValidateSetAsync(engine, @"[^\u0000-\uFFFF]", RegexOptions.None, set, null); await ValidateSetAsync(engine, @"[^\u0000-\uFFFFa-z]", RegexOptions.None, set, null); await ValidateSetAsync(engine, @"[^\u0000-\uFFFE\u0001-\uFFFF]", RegexOptions.None, set, null); await ValidateSetAsync(engine, @"[^\u0000-\u1000\u1001-\u2002\u2003-\uFFFF]", RegexOptions.None, set, null, validateEveryChar: true); + foreach (string empty in new[] { @"[^\d\D]", @"[^\D\d]", @"[^\w\W]", @"[^\W\w]", @"[^\s\S]", @"[^\S\s]", }) + { + await ValidateSetAsync(engine, empty, RegexOptions.None, set, null, validateEveryChar: true); + } } [Theory]