diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index c65e5187a5b210..0e9ac4f2f3031a 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -340,7 +340,7 @@ private static (bool NeedsTryFind, bool NeedsTryMatch) EmitScan(IndentedTextWrit // empty, it's helpful as a learning exposition tool. writer.WriteLine("// The pattern never matches anything."); } - else if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set && (root.Options & RegexOptions.IgnoreCase) == 0) + else if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set) { // If the whole expression is just one or more characters, we can rely on the FindOptimizations spitting out // an IndexOf that will find the exact sequence or not, and we don't need to do additional checking beyond that. @@ -3776,14 +3776,16 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s // but that call is relatively expensive. Before we fall back to it, we try to optimize // some common cases for which we can do much better, such as known character classes // for which we can call a dedicated method, or a fast-path for ASCII using a lookup table. + // In some cases, multiple optimizations are possible for a given character class: the checks + // in this method are generally ordered from fastest / simplest to slowest / most complex so + // that we get the best optimization for a given char class. // First, see if the char class is a built-in one for which there's a better function // we can just call directly. 
switch (charClass) { case RegexCharClass.AnyClass: - // ideally this could just be "return true;", but we need to evaluate the expression for its side effects - return $"({chExpr} {(negate ? "<" : ">=")} 0)"; // a char is unsigned and thus won't ever be negative + return negate ? "false" : "true"; // This assumes chExpr never has side effects. case RegexCharClass.DigitClass: case RegexCharClass.NotDigitClass: @@ -3811,60 +3813,127 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s $"(((uint){chExpr}) - {Literal(lowInclusive)} {(negate ? ">" : "<=")} (uint)({Literal(highInclusive)} - {Literal(lowInclusive)}))"; } - // Next if the character class contains nothing but a single Unicode category, we can calle char.GetUnicodeCategory and + // Next, if the character class contains nothing but Unicode categories, we can call char.GetUnicodeCategory and // compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus - // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass. - if (RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated)) + // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass, + // but without the optimizations the C# compiler will provide for switches. + Span categories = stackalloc UnicodeCategory[30]; // number of UnicodeCategory values (though it's unheard of to have a set with all of them) + if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negated)) { + // TODO https://github.com/dotnet/roslyn/issues/58246: Use pattern matching instead of switch once C# code gen quality improves. negate ^= negated; - return $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} UnicodeCategory.{category})"; + return numCategories == 1 ? 
+ $"(char.GetUnicodeCategory({chExpr}) {(negate ? "!=" : "==")} UnicodeCategory.{categories[0]})" : + $"(char.GetUnicodeCategory({chExpr}) switch {{ {string.Join(" or ", categories.Slice(0, numCategories).ToArray().Select(c => $"UnicodeCategory.{c}"))} => {(negate ? "false" : "true")}, _ => {(negate ? "true" : "false")} }})"; } // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), // it may be cheaper and smaller to compare against each than it is to use a lookup table. We can also special-case // the very common case with case insensitivity of two characters next to each other being the upper and lowercase // ASCII variants of each other, in which case we can use bit manipulation to avoid a comparison. - if (!RegexCharClass.IsNegated(charClass)) + Span setChars = stackalloc char[3]; + int mask; + switch (RegexCharClass.GetSetChars(charClass, setChars)) { - Span setChars = stackalloc char[3]; - int mask; - switch (RegexCharClass.GetSetChars(charClass, setChars)) + case 2: + negate ^= RegexCharClass.IsNegated(charClass); + if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) + { + return $"(({chExpr} | 0x{mask:X}) {(negate ? "!=" : "==")} {Literal((char)(setChars[1] | mask))})"; + } + additionalDeclarations.Add("char ch;"); + return negate ? 
+ $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}))" : + $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; + + case 3: + negate ^= RegexCharClass.IsNegated(charClass); + additionalDeclarations.Add("char ch;"); + return (negate, RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) switch + { + (false, false) => $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))", + (true, false) => $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))", + (false, true) => $"((((ch = {chExpr}) | 0x{mask:X}) == {Literal((char)(setChars[1] | mask))}) | (ch == {Literal(setChars[2])}))", + (true, true) => $"((((ch = {chExpr}) | 0x{mask:X}) != {Literal((char)(setChars[1] | mask))}) & (ch != {Literal(setChars[2])}))", + }; + } + + // Next, handle simple sets of two ASCII letter ranges that are cased versions of each other, e.g. [A-Za-z]. + // This can be implemented as if it were a single range, with an additional bitwise operation. + if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) rangeLower, out (char LowInclusive, char HighInclusive) rangeUpper) && + RegexCharClass.IsAsciiLetter(rangeUpper.LowInclusive) && + RegexCharClass.IsAsciiLetter(rangeUpper.HighInclusive) && + (rangeLower.LowInclusive | 0x20) == rangeUpper.LowInclusive && + (rangeLower.HighInclusive | 0x20) == rangeUpper.HighInclusive) + { + Debug.Assert(rangeLower.LowInclusive != rangeUpper.LowInclusive); + negate ^= RegexCharClass.IsNegated(charClass); + return $"((uint)(({chExpr} | 0x20) - {Literal(rangeUpper.LowInclusive)}) {(negate ? ">" : "<=")} (uint)({Literal(rangeUpper.HighInclusive)} - {Literal(rangeUpper.LowInclusive)}))"; + } + + // Analyze the character set more to determine what code to generate. 
+ RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass); + + // Next, handle sets where the high - low + 1 range is <= 64. In that case, we can emit + // a branchless lookup in a ulong that does not rely on loading any objects (e.g. the string-based + // lookup we use later). This nicely handles common sets like [0-9A-Fa-f], [0-9a-f], [A-Za-z], etc. + if (analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges - analysis.LowerBoundInclusiveIfOnlyRanges) <= 64) + { + additionalDeclarations.Add("ulong charMinusLow;"); + + // Create the 64-bit value with 1s at indices corresponding to every character in the set, + // where the bit is computed to be the char value minus the lower bound starting from + // most significant bit downwards. + bool negatedClass = RegexCharClass.IsNegated(charClass); + ulong bitmap = 0; + for (int i = analysis.LowerBoundInclusiveIfOnlyRanges; i < analysis.UpperBoundExclusiveIfOnlyRanges; i++) { - case 2: - if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) - { - return $"(({chExpr} | 0x{mask:X}) {(negate ? "!=" : "==")} {Literal((char)(setChars[1] | mask))})"; - } - additionalDeclarations.Add("char ch;"); - return negate ? 
- $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}))" : - $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))"; - - case 3: - additionalDeclarations.Add("char ch;"); - return (negate, RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out mask)) switch - { - (false, false) => $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))", - (true, false) => $"(((ch = {chExpr}) != {Literal(setChars[0])}) & (ch != {Literal(setChars[1])}) & (ch != {Literal(setChars[2])}))", - (false, true) => $"((((ch = {chExpr}) | 0x{mask:X}) == {Literal((char)(setChars[1] | mask))}) | (ch == {Literal(setChars[2])}))", - (true, true) => $"((((ch = {chExpr}) | 0x{mask:X}) != {Literal((char)(setChars[1] | mask))}) & (ch != {Literal(setChars[2])}))", - }; + if (RegexCharClass.CharInClass((char)i, charClass) ^ negatedClass) + { + bitmap |= (1ul << (63 - (i - analysis.LowerBoundInclusiveIfOnlyRanges))); + } } + + // To determine whether a character is in the set, we subtract the lowest char (casting to + // uint to account for any smaller values); this subtraction happens before the result is + // zero-extended to ulong, meaning that `charMinusLow` will always have upper 32 bits equal to 0. + // We then left shift the constant with this offset, and apply a bitmask that has the highest + // bit set (the sign bit) if and only if `chExpr` is in the [low, low + 64) range. + // Then we only need to check whether this final result is less than 0: this will only be + // the case if both `charMinusLow` was in fact the index of a set bit in the constant, and also + // `chExpr` was in the allowed range (this ensures that false positive bit shifts are ignored). + negate ^= negatedClass; + return $"((long)((0x{bitmap:X}UL << (int)(charMinusLow = (uint){chExpr} - {Literal((char)analysis.LowerBoundInclusiveIfOnlyRanges)})) & (charMinusLow - 64)) {(negate ? 
">=" : "<")} 0)"; } // All options after this point require a ch local. additionalDeclarations.Add("char ch;"); - // Analyze the character set more to determine what code to generate. - RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass); + // Next, handle simple sets of two ranges, e.g. [\p{IsGreek}\p{IsGreekExtended}]. + if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) range0, out (char LowInclusive, char HighInclusive) range1)) + { + negate ^= RegexCharClass.IsNegated(charClass); + + string range0Clause = range0.LowInclusive == range0.HighInclusive ? + $"((ch = {chExpr}) {(negate ? "!=" : "==")} {Literal(range0.LowInclusive)})" : + $"((uint)((ch = {chExpr}) - {Literal(range0.LowInclusive)}) {(negate ? ">" : "<=")} (uint)({Literal(range0.HighInclusive)} - {Literal(range0.LowInclusive)}))"; + + string range1Clause = range1.LowInclusive == range1.HighInclusive ? + $"(ch {(negate ? "!=" : "==")} {Literal(range1.LowInclusive)})" : + $"((uint)(ch - {Literal(range1.LowInclusive)}) {(negate ? ">" : "<=")} (uint)({Literal(range1.HighInclusive)} - {Literal(range1.LowInclusive)}))"; + + return negate ? + $"({range0Clause} & {range1Clause})" : + $"({range0Clause} | {range1Clause})"; + } if (analysis.ContainsNoAscii) { // We determined that the character class contains only non-ASCII, - // for example if the class were [\p{IsGreek}\p{IsGreekExtended}], which is - // the same as [\u0370-\u03FF\u1F00-1FFF]. (In the future, we could possibly - // extend the analysis to produce a known lower-bound and compare against - // that rather than always using 128 as the pivot point.) + // for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000]. + // (In the future, we could possibly extend the analysis to produce a known + // lower-bound and compare against that rather than always using 128 as the + // pivot point.) return negate ? 
$"((ch = {chExpr}) < 128 || !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : $"((ch = {chExpr}) >= 128 && RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; @@ -3912,19 +3981,18 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s if (analysis.ContainsOnlyAscii) { - // We know that all inputs that could match are ASCII, for example if the - // character class were [A-Za-z0-9], so since the ch is now known to be >= 128, we - // can just fail the comparison. + // If all inputs that could match are ASCII, we only need the lookup table, guarded + // by a check for the upper bound (which serves both to limit for what characters + // we need to access the lookup table and to bounds check the lookup table access). return negate ? - $"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : - $"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfContainsOnlyAscii)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; + $"((ch = {chExpr}) >= {Literal((char)analysis.UpperBoundExclusiveIfOnlyRanges)} || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : + $"((ch = {chExpr}) < {Literal((char)analysis.UpperBoundExclusiveIfOnlyRanges)} && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; } if (analysis.AllNonAsciiContained) { - // We know that all non-ASCII inputs match, for example if the character - // class were [^\r\n], so since we just determined the ch to be >= 128, we can just - // give back success. + // If every non-ASCII value is considered a match, we can immediately succeed for any + // non-ASCII inputs, and access the lookup table for the rest. return negate ? 
$"((ch = {chExpr}) < 128 && ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0)" : $"((ch = {chExpr}) >= 128 || ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0)"; @@ -3932,7 +4000,8 @@ private static string MatchCharacterClass(RegexOptions options, string chExpr, s // We know that the whole class wasn't ASCII, and we don't know anything about the non-ASCII // characters other than that some might be included, for example if the character class - // were [\w\d], so since ch >= 128, we need to fall back to calling CharInClass. + // were [\w\d], so if ch >= 128, we need to fall back to calling CharInClass, otherwise use + // the lookup table. return negate ? $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) == 0 : !RegexRunner.CharInClass((char)ch, {Literal(charClass)}))" : $"((ch = {chExpr}) < 128 ? ({Literal(bitVectorString)}[ch >> 4] & (1 << (ch & 0xF))) != 0 : RegexRunner.CharInClass((char)ch, {Literal(charClass)}))"; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 219d9823528651..3e341c7655e9b1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -23,7 +23,10 @@ namespace System.Text.RegularExpressions // and see where the character should go. Based on whether the ending index is odd or even, // we know if the character is in the set. // m+1...n The categories. This is a list of UnicodeCategory enum values which describe categories - // included in this class. + // included in this class. 
These can either be individual values (either UnicodeCategory + 1 + // for inclusive values, or -1 - UnicodeCategory for exclusive values), or a "group", which + // is a contiguous sequence of such values surrounded by \0 values; all values in the group + // have the same positive/negative orientation. /// Provides the "set of Unicode chars" functionality used by the regexp engine. internal sealed partial class RegexCharClass @@ -34,43 +37,49 @@ internal sealed partial class RegexCharClass internal const int CategoryLengthIndex = 2; internal const int SetStartIndex = 3; // must be odd for subsequent logic to work - private const string NullCharString = "\0"; - private const char NullChar = '\0'; internal const char LastChar = '\uFFFF'; internal const short SpaceConst = 100; private const short NotSpaceConst = -100; private const string InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__"; - private const string Space = "\x64"; - private const string NotSpace = "\uFF9C"; - private const string Word = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; - private const string NotWord = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; + private const string SpaceCategories = "\x64"; + private const string NotSpaceCategories = "\uFF9C"; + private const string WordCategories = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; + private const string NotWordCategories = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; internal const string SpaceClass = "\u0000\u0000\u0001\u0064"; - internal const string NotSpaceClass = "\u0001\u0000\u0001\u0064"; + internal const string NotSpaceClass = "\u0000\u0000\u0001\uFF9C"; internal const string WordClass = "\u0000\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; - internal const string NotWordClass = "\u0001\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; + internal const string NotWordClass = 
"\u0000\u0000\u000A\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; internal const string DigitClass = "\u0000\u0000\u0001\u0009"; internal const string NotDigitClass = "\u0000\u0000\u0001\uFFF7"; - private const string ECMASpaceSet = "\u0009\u000E\u0020\u0021"; - private const string NotECMASpaceSet = "\0\u0009\u000E\u0020\u0021"; - private const string ECMAWordSet = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; - private const string NotECMAWordSet = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; - private const string ECMADigitSet = "\u0030\u003A"; - private const string NotECMADigitSet = "\0\u0030\u003A"; + private const string ECMASpaceRanges = "\u0009\u000E\u0020\u0021"; + private const string NotECMASpaceRanges = "\0\u0009\u000E\u0020\u0021"; + private const string ECMAWordRanges = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; + private const string NotECMAWordRanges = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; + private const string ECMADigitRanges = "\u0030\u003A"; + private const string NotECMADigitRanges = "\0\u0030\u003A"; - internal const string ECMASpaceClass = "\x00\x04\x00" + ECMASpaceSet; - internal const string NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceSet; - internal const string ECMAWordClass = "\x00\x0A\x00" + ECMAWordSet; - internal const string NotECMAWordClass = "\x01\x0A\x00" + ECMAWordSet; - internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitSet; - internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet; + internal const string ECMASpaceClass = "\x00\x04\x00" + ECMASpaceRanges; + internal const string NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceRanges; + internal const string ECMAWordClass = "\x00\x0A\x00" + ECMAWordRanges; + internal const string NotECMAWordClass = "\x01\x0A\x00" + ECMAWordRanges; + internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitRanges; + internal const string NotECMADigitClass 
= "\x01\x02\x00" + ECMADigitRanges; internal const string AnyClass = "\x00\x01\x00\x00"; private const string EmptyClass = "\x00\x00\x00"; + // Sets regularly used as a canonical way to express the equivalent of '.' with Singleline when Singleline isn't in use. + internal const string WordNotWordClass = "\u0000\u0000\u0014\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; + internal const string NotWordWordClass = "\u0000\u0000\u0014\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; + internal const string DigitNotDigitClass = "\u0000\u0000\u0002\u0009\uFFF7"; + internal const string NotDigitDigitClass = "\u0000\u0000\u0002\uFFF7\u0009"; + internal const string SpaceNotSpaceClass = "\u0000\u0000\u0002\u0064\uFF9C"; + internal const string NotSpaceSpaceClass = "\u0000\u0000\u0002\uFF9C\u0064"; + // UnicodeCategory is zero based, so we add one to each value and subtract it off later private const int DefinedCategoriesCapacity = 38; private static readonly Dictionary s_definedCategories = new Dictionary(DefinedCategoriesCapacity) @@ -93,7 +102,7 @@ internal sealed partial class RegexCharClass // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter} // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!! - { "__InternalRegexIgnoreCase__", "\u0000\u0002\u0003\u0001\u0000" }, + { InternalRegexIgnoreCase, "\u0000\u0002\u0003\u0001\u0000" }, // Marks { "Mc", "\u0007" }, // UnicodeCategory.SpacingCombiningMark + 1 @@ -347,9 +356,9 @@ private StringBuilder EnsureCategories() => _rangelist ??= new List<(char First, char Last)>(6); /// - /// Adds a set (specified by its string representation) to the class. + /// Adds ranges (specified by their range string representation) to the class. 
/// - private void AddSet(ReadOnlySpan set) + private void AddRanges(ReadOnlySpan set) { if (set.Length == 0) { @@ -410,7 +419,7 @@ public void AddCategoryFromName(string categoryName, bool invert, bool caseInsen } else { - AddSet(SetFromProperty(categoryName, invert, pattern, currentPos)); + AddRanges(RangesFromProperty(categoryName, invert, pattern, currentPos)); } } @@ -469,11 +478,11 @@ public void AddWord(bool ecma, bool negate) { if (ecma) { - AddSet((negate ? NotECMAWordSet : ECMAWordSet).AsSpan()); + AddRanges((negate ? NotECMAWordRanges : ECMAWordRanges).AsSpan()); } else { - AddCategory(negate ? NotWord : Word); + AddCategory(negate ? NotWordCategories : WordCategories); } } @@ -481,11 +490,11 @@ public void AddSpace(bool ecma, bool negate) { if (ecma) { - AddSet((negate ? NotECMASpaceSet : ECMASpaceSet).AsSpan()); + AddRanges((negate ? NotECMASpaceRanges : ECMASpaceRanges).AsSpan()); } else { - AddCategory(negate ? NotSpace : Space); + AddCategory(negate ? NotSpaceCategories : SpaceCategories); } } @@ -493,7 +502,7 @@ public void AddDigit(bool ecma, bool negate, string pattern, int currentPos) { if (ecma) { - AddSet((negate ? NotECMADigitSet : ECMADigitSet).AsSpan()); + AddRanges((negate ? NotECMADigitRanges : ECMADigitRanges).AsSpan()); } else { @@ -581,42 +590,118 @@ public static bool IsSingletonInverse(string set) => !IsSubtraction(set) && (set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]); - /// Gets whether the set contains nothing other than a single UnicodeCategory (it may be negated). - /// The set to examine. - /// The single category if there was one. - /// true if the single category is a not match. - /// true if a single category could be obtained; otherwise, false. 
- public static bool TryGetSingleUnicodeCategory(string set, out UnicodeCategory category, out bool negated) + /// + /// Gets the categories from a set if the set is only categories (no ranges, no subtraction), + /// they all share the same negation status (not doing so is rare), and they all fit in the destination span. + /// + /// The character class to examine. + /// The destination span into which the categories should be written. + /// The number of categories written to . + /// false if the categories written to represent inclusions; true if they represent exclusions. + /// true if the categories could be retrieved; otherwise, false. + public static bool TryGetOnlyCategories(string set, Span categories, out int numCategories, out bool negated) { - if (set[CategoryLengthIndex] == 1 && - set[SetLengthIndex] == 0 && - !IsSubtraction(set)) + negated = false; + numCategories = 0; + bool sawFirstCategory = false; + + // Require that the character class has no ranges, has no subtraction, and has categories. + int categoryLength = set[CategoryLengthIndex]; + if (categoryLength == 0 || set[SetLengthIndex] != 0 || IsSubtraction(set)) { - short c = (short)set[SetStartIndex]; + return false; + } + // Loop through all categories, storing them into the categories span. + int categoryEnd = SetStartIndex + set[CategoryLengthIndex]; + for (int pos = SetStartIndex; pos < categoryEnd; pos++) + { + // Get the next category value. + short c = (short)set[pos]; if (c > 0) { - if (c != SpaceConst) + // It's a positive (inclusive) value. Make sure all previous categories seen are also positive. + // Also make sure it's not the fake space category, which consumers don't handle as it's + // not a real UnicodeCategory. 
+ if ((sawFirstCategory && negated) || + c == SpaceConst || + numCategories == categories.Length) { - category = (UnicodeCategory)(c - 1); - negated = IsNegated(set); - return true; + return false; } + + sawFirstCategory = true; + categories[numCategories++] = (UnicodeCategory)(c - 1); } else if (c < 0) { - if (c != NotSpaceConst) + // It's a negative (exclusive) value. Make sure all previous categories seen are also negative. + // Also make sure it's not the fake non-space category, which consumers don't handle as it's + // not a real UnicodeCategory. + if ((sawFirstCategory && !negated) || + c == NotSpaceConst || + numCategories == categories.Length) { - category = (UnicodeCategory)(-1 - c); - negated = !IsNegated(set); - return true; + return false; + } + + sawFirstCategory = true; + negated = true; + categories[numCategories++] = (UnicodeCategory)(-1 - c); + } + else // c == 0 + { + // It's the start of a group. Every value in the group needs to have the same orientation. + // We stop when we hit the next 0. + c = (short)set[++pos]; + Debug.Assert(c != 0); + if (c > 0) + { + if (sawFirstCategory && negated) + { + return false; + } + sawFirstCategory = true; + + do + { + if (numCategories == categories.Length) + { + return false; + } + + categories[numCategories++] = (UnicodeCategory)(c - 1); + c = (short)set[++pos]; + } + while (c != 0); + } + else + { + if (sawFirstCategory && !negated) + { + return false; + } + negated = true; + sawFirstCategory = true; + + do + { + if (numCategories == categories.Length) + { + return false; + } + + categories[numCategories++] = (UnicodeCategory)(-1 - c); + c = (short)set[++pos]; + } + while (c != 0); } } } - category = default; - negated = false; - return false; + // Factor in whether the entire character class is itself negated. + negated ^= IsNegated(set); + return true; } /// Attempts to get a single range stored in the set. 
@@ -652,6 +737,32 @@ public static bool TryGetSingleRange(string set, out char lowInclusive, out char return false; } + /// Attempts to get two ranges stored in the set. The set may be negated. + /// The set. + /// The first result range. + /// The second result range. + /// true if the set contained exactly two ranges; otherwise, false. + public static bool TryGetDoubleRange( + string set, + out (char LowInclusive, char HighInclusive) range0, + out (char LowInclusive, char HighInclusive) range1) + { + if (set[CategoryLengthIndex] == 0 && // must not have any categories + set.Length == SetStartIndex + set[SetLengthIndex]) // and no subtraction + { + int setLength = set[SetLengthIndex]; + if (setLength is 3 or 4) + { + range0 = (set[SetStartIndex], (char)(set[SetStartIndex + 1] - 1)); + range1 = (set[SetStartIndex + 2], setLength == 3 ? LastChar : (char)(set[SetStartIndex + 3] - 1)); + return true; + } + } + + range0 = range1 = ('\0', '\0'); + return false; + } + /// Gets all of the characters in the specified set, storing them into the provided span. /// The character class. /// The span into which the chars should be stored. @@ -853,6 +964,10 @@ public static bool IsAscii(ReadOnlySpan s) // TODO https://github.com/dotn return true; } + /// Gets whether the specified character is an ASCII letter. + public static bool IsAsciiLetter(char c) => // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available + (uint)((c | 0x20) - 'a') <= 'z' - 'a'; + /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents. /// This may enumerate negated characters if the set is negated. private static bool CanEasilyEnumerateSetContents(string set) => @@ -865,16 +980,26 @@ private static bool CanEasilyEnumerateSetContents(string set) => /// Provides results from . internal struct CharClassAnalysisResults { + /// true if the set contains only ranges; false if it contains Unicode categories and/or subtraction. 
+ public bool OnlyRanges; /// true if we know for sure that the set contains only ASCII values; otherwise, false. + /// This can only be true if is true. public bool ContainsOnlyAscii; /// true if we know for sure that the set doesn't contain any ASCII values; otherwise, false. + /// This can only be true if is true. public bool ContainsNoAscii; /// true if we know for sure that all ASCII values are in the set; otherwise, false. + /// This can only be true if is true. public bool AllAsciiContained; /// true if we know for sure that all non-ASCII values are in the set; otherwise, false. + /// This can only be true if is true. public bool AllNonAsciiContained; - /// The exclusive upper bound. Only valid if is true. - public int UpperBoundExclusiveIfContainsOnlyAscii; + /// The inclusive lower bound. + /// This is only valid if is true. + public int LowerBoundInclusiveIfOnlyRanges; + /// The exclusive upper bound. + /// This is only valid if is true. + public int UpperBoundExclusiveIfOnlyRanges; } /// Analyzes the set to determine some basic properties that can be used to optimize usage. @@ -901,10 +1026,13 @@ internal static CharClassAnalysisResults Analyze(string set) // everything ASCII is included. return new CharClassAnalysisResults { + OnlyRanges = true, AllNonAsciiContained = set[set.Length - 1] < 128, AllAsciiContained = set[SetStartIndex] >= 128, ContainsNoAscii = false, - ContainsOnlyAscii = false + ContainsOnlyAscii = false, + LowerBoundInclusiveIfOnlyRanges = set[SetStartIndex], + UpperBoundExclusiveIfOnlyRanges = set[set.Length - 1], }; } @@ -912,11 +1040,13 @@ internal static CharClassAnalysisResults Analyze(string set) // Similarly if the lower bound is non-ASCII, that means no ASCII is in the class. 
return new CharClassAnalysisResults { + OnlyRanges = true, AllNonAsciiContained = false, AllAsciiContained = false, ContainsOnlyAscii = set[set.Length - 1] <= 128, ContainsNoAscii = set[SetStartIndex] >= 128, - UpperBoundExclusiveIfContainsOnlyAscii = set[set.Length - 1], + LowerBoundInclusiveIfOnlyRanges = set[SetStartIndex], + UpperBoundExclusiveIfOnlyRanges = set[set.Length - 1], }; } @@ -1220,34 +1350,30 @@ private static bool CharInCategoryGroup(UnicodeCategory chcategory, string categ { int pos = i + 1; int curcat = (short)category[pos]; - bool result; if (curcat > 0) { // positive case - the character must be in ANY of the categories in the group result = false; - for (; curcat != 0; curcat = (short)category[pos]) + do { - pos++; - if (!result && chcategory == (UnicodeCategory)(curcat - 1)) - { - result = true; - } + result |= chcategory == (UnicodeCategory)(curcat - 1); + curcat = (short)category[++pos]; } + while (curcat != 0); } else { // negative case - the character must be in NONE of the categories in the group + Debug.Assert(curcat < 0); result = true; - for (; curcat != 0; curcat = (short)category[pos]) + do { - pos++; - if (result && chcategory == (UnicodeCategory)(-1 - curcat)) - { - result = false; - } + result &= chcategory != (UnicodeCategory)(-1 - curcat); + curcat = (short)category[++pos]; } + while (curcat != 0); } i = pos; @@ -1559,7 +1685,7 @@ _subtractor is null && } } - private static ReadOnlySpan SetFromProperty(string capname, bool invert, string pattern, int currentPos) + private static ReadOnlySpan RangesFromProperty(string capname, bool invert, string pattern, int currentPos) { int min = 0; int max = s_propTable.Length; @@ -1581,8 +1707,8 @@ private static ReadOnlySpan SetFromProperty(string capname, bool invert, s Debug.Assert(!string.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table"); return !invert ? set.AsSpan() : - set[0] == NullChar ? 
set.AsSpan(1) : - (NullCharString + set).AsSpan(); + set[0] == '\0' ? set.AsSpan(1) : + ("\0" + set).AsSpan(); } } @@ -1671,11 +1797,11 @@ public static string DescribeSet(string set) if (!found) { - if (group.Equals(Word)) + if (group.Equals(WordCategories)) { desc.Append("\\w"); } - else if (group.Equals(NotWord)) + else if (group.Equals(NotWordCategories)) { desc.Append("\\W"); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index e6d1e47fb6b9f6..2fa72def6f27fe 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -4508,7 +4508,7 @@ protected void EmitScan(RegexOptions options, DynamicMethod tryFindNextStartingP RegexNode root = _regexTree!.Root.Child(0); Label returnLabel = DefineLabel(); - if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set && (root.Options & RegexOptions.IgnoreCase) == 0) + if (root.Kind is RegexNodeKind.Multi or RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set) { // If the whole expression is just one or more characters, we can rely on the FindOptimizations spitting out // an IndexOf that will find the exact sequence or not, and we don't need to do additional checking beyond that. @@ -4634,6 +4634,9 @@ private void EmitMatchCharacterClass(string charClass) // but that call is relatively expensive. Before we fall back to it, we try to optimize // some common cases for which we can do much better, such as known character classes // for which we can call a dedicated method, or a fast-path for ASCII using a lookup table. 
+ // In some cases, multiple optimizations are possible for a given character class: the checks + // in this method are generally ordered from fastest / simplest to slowest / most complex so + // that we get the best optimization for a given char class. // First, see if the char class is a built-in one for which there's a better function // we can just call directly. @@ -4710,14 +4713,17 @@ private void EmitMatchCharacterClass(string charClass) return; } - // Next if the character class contains nothing but a single Unicode category, we can calle char.GetUnicodeCategory and + // Next, if the character class contains nothing but Unicode categories, we can call char.GetUnicodeCategory and // compare against it. It has a fast-lookup path for ASCII, so is as good or better than any lookup we'd generate (plus // we get smaller code), and it's what we'd do for the fallback (which we get to avoid generating) as part of CharInClass. - if (RegexCharClass.TryGetSingleUnicodeCategory(charClass, out UnicodeCategory category, out bool negated)) + // Unlike the source generator, however, we only handle the case of a single UnicodeCategory: the source generator is able + // to rely on C# compiler optimizations to handle dealing with multiple values efficiently. + Span categories = stackalloc UnicodeCategory[1]; // handle the case of one and only one category + if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negated)) { // char.GetUnicodeCategory(ch) == category Call(s_charGetUnicodeInfo); - Ldc((int)category); + Ldc((int)categories[0]); Ceq(); if (negated) { @@ -4728,59 +4734,212 @@ private void EmitMatchCharacterClass(string charClass) return; } - // All checks after this point require reading the input character multiple times, + // Checks after this point require reading the input character multiple times, // so we store it into a temporary local. 
using RentedLocalBuilder tempLocal = RentInt32Local(); Stloc(tempLocal); // Next, if there's only 2 or 3 chars in the set (fairly common due to the sets we create for prefixes), // it's cheaper and smaller to compare against each than it is to use a lookup table. - if (!RegexCharClass.IsNegated(charClass)) + Span setChars = stackalloc char[3]; + int numChars = RegexCharClass.GetSetChars(charClass, setChars); + if (numChars is 2 or 3) { - Span setChars = stackalloc char[3]; - int numChars = RegexCharClass.GetSetChars(charClass, setChars); - if (numChars is 2 or 3) + if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out int mask)) // special-case common case of an upper and lowercase ASCII letter combination { - if (RegexCharClass.DifferByOneBit(setChars[0], setChars[1], out int mask)) // special-case common case of an upper and lowercase ASCII letter combination - { - // ((ch | mask) == setChars[1]) - Ldloc(tempLocal); - Ldc(mask); - Or(); - Ldc(setChars[1] | mask); - Ceq(); - } - else - { - // (ch == setChars[0]) | (ch == setChars[1]) - Ldloc(tempLocal); - Ldc(setChars[0]); - Ceq(); - Ldloc(tempLocal); - Ldc(setChars[1]); - Ceq(); - Or(); - } + // ((ch | mask) == setChars[1]) + Ldloc(tempLocal); + Ldc(mask); + Or(); + Ldc(setChars[1] | mask); + Ceq(); + } + else + { + // (ch == setChars[0]) | (ch == setChars[1]) + Ldloc(tempLocal); + Ldc(setChars[0]); + Ceq(); + Ldloc(tempLocal); + Ldc(setChars[1]); + Ceq(); + Or(); + } - // | (ch == setChars[2]) - if (numChars == 3) - { - Ldloc(tempLocal); - Ldc(setChars[2]); - Ceq(); - Or(); - } + // | (ch == setChars[2]) + if (numChars == 3) + { + Ldloc(tempLocal); + Ldc(setChars[2]); + Ceq(); + Or(); + } - return; + if (RegexCharClass.IsNegated(charClass)) + { + Ldc(0); + Ceq(); } + return; } - using RentedLocalBuilder resultLocal = RentInt32Local(); + // Next, handle simple sets of two ASCII letter ranges that are cased versions of each other, e.g. [A-Za-z]. 
+ // This can be implemented as if it were a single range, with an additional bitwise operation. + if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) rangeLower, out (char LowInclusive, char HighInclusive) rangeUpper) && + RegexCharClass.IsAsciiLetter(rangeUpper.LowInclusive) && + RegexCharClass.IsAsciiLetter(rangeUpper.HighInclusive) && + (rangeLower.LowInclusive | 0x20) == rangeUpper.LowInclusive && + (rangeLower.HighInclusive | 0x20) == rangeUpper.HighInclusive) + { + Debug.Assert(rangeLower.LowInclusive != rangeUpper.LowInclusive); + bool negate = RegexCharClass.IsNegated(charClass); + + // (uint)((ch | 0x20) - lowInclusive) < highInclusive - lowInclusive + 1 + Ldloc(tempLocal); + Ldc(0x20); + Or(); + Ldc(rangeUpper.LowInclusive); + Sub(); + Ldc(rangeUpper.HighInclusive - rangeUpper.LowInclusive + 1); + CltUn(); + if (negate) + { + Ldc(0); + Ceq(); + } + return; + } // Analyze the character set more to determine what code to generate. RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass); - // Helper method that emits a call to RegexRunner.CharInClass(ch{.ToLowerInvariant()}, charClass) + // Next, handle sets where the high - low + 1 range is <= 64. In that case, we can emit + // a branchless lookup in a ulong that does not rely on loading any objects (e.g. the string-based + // lookup we use later). This nicely handles common sets like [0-9A-Fa-f], [0-9a-f], [A-Za-z], etc. + if (analysis.OnlyRanges && (analysis.UpperBoundExclusiveIfOnlyRanges - analysis.LowerBoundInclusiveIfOnlyRanges) <= 64) + { + // Create the 64-bit value with 1s at indices corresponding to every character in the set, + // where the bit is computed to be the char value minus the lower bound starting from + // most significant bit downwards. 
+ ulong bitmap = 0; + bool negatedClass = RegexCharClass.IsNegated(charClass); + for (int i = analysis.LowerBoundInclusiveIfOnlyRanges; i < analysis.UpperBoundExclusiveIfOnlyRanges; i++) + { + if (RegexCharClass.CharInClass((char)i, charClass) ^ negatedClass) + { + bitmap |= (1ul << (63 - (i - analysis.LowerBoundInclusiveIfOnlyRanges))); + } + } + + // To determine whether a character is in the set, we subtract the lowest char (casting to + // uint to account for any smaller values); this subtraction happens before the result is + // zero-extended to ulong, meaning that `charMinusLow` will always have upper 32 bits equal to 0. + // We then left shift the constant with this offset, and apply a bitmask that has the highest + // bit set (the sign bit) if and only if `chExpr` is in the [low, low + 64) range. + // Then we only need to check whether this final result is less than 0: this will only be + // the case if both `charMinusLow` was in fact the index of a set bit in the constant, and also + // `chExpr` was in the allowed range (this ensures that false positive bit shifts are ignored). + + // ulong charMinusLow = (uint)ch - lowInclusive; + LocalBuilder charMinusLow = _ilg!.DeclareLocal(typeof(ulong)); + Ldloc(tempLocal); + Ldc(analysis.LowerBoundInclusiveIfOnlyRanges); + Sub(); + _ilg!.Emit(OpCodes.Conv_U8); + Stloc(charMinusLow); + + // ulong shift = bitmap << (int)charMinusLow; + LdcI8((long)bitmap); + Ldloc(charMinusLow); + _ilg!.Emit(OpCodes.Conv_I4); + Ldc(63); + And(); + Shl(); + + // ulong mask = charMinusLow - 64; + Ldloc(charMinusLow); + Ldc(64); + _ilg!.Emit(OpCodes.Conv_I8); + Sub(); + + // (long)(shift & mask) < 0 // or >= for a negated character class + And(); + Ldc(0); + _ilg!.Emit(OpCodes.Conv_I8); + _ilg!.Emit(OpCodes.Clt); + if (negatedClass) + { + Ldc(0); + Ceq(); + } + + return; + } + + // Next, handle simple sets of two ranges, e.g. [\p{IsGreek}\p{IsGreekExtended}]. 
+ if (RegexCharClass.TryGetDoubleRange(charClass, out (char LowInclusive, char HighInclusive) range0, out (char LowInclusive, char HighInclusive) range1)) + { + bool negate = RegexCharClass.IsNegated(charClass); + + if (range0.LowInclusive == range0.HighInclusive) + { + // ch == lowInclusive + Ldloc(tempLocal); + Ldc(range0.LowInclusive); + Ceq(); + } + else + { + // (uint)(ch - lowInclusive) < (uint)(highInclusive - lowInclusive + 1) + Ldloc(tempLocal); + Ldc(range0.LowInclusive); + Sub(); + Ldc(range0.HighInclusive - range0.LowInclusive + 1); + CltUn(); + } + if (negate) + { + Ldc(0); + Ceq(); + } + + if (range1.LowInclusive == range1.HighInclusive) + { + // ch == lowInclusive + Ldloc(tempLocal); + Ldc(range1.LowInclusive); + Ceq(); + } + else + { + // (uint)(ch - lowInclusive) < (uint)(highInclusive - lowInclusive + 1) + Ldloc(tempLocal); + Ldc(range1.LowInclusive); + Sub(); + Ldc(range1.HighInclusive - range1.LowInclusive + 1); + CltUn(); + } + if (negate) + { + Ldc(0); + Ceq(); + } + + if (negate) + { + And(); + } + else + { + Or(); + } + + return; + } + + using RentedLocalBuilder resultLocal = RentInt32Local(); + + // Helper method that emits a call to RegexRunner.CharInClass(ch, charClass) void EmitCharInClass() { Ldloc(tempLocal); @@ -4795,10 +4954,10 @@ void EmitCharInClass() if (analysis.ContainsNoAscii) { // We determined that the character class contains only non-ASCII, - // for example if the class were [\p{IsGreek}\p{IsGreekExtended}], which is - // the same as [\u0370-\u03FF\u1F00-1FFF]. (In the future, we could possibly - // extend the analysis to produce a known lower-bound and compare against - // that rather than always using 128 as the pivot point.) + // for example if the class were [\u1000-\u2000\u3000-\u4000\u5000-\u6000]. + // (In the future, we could possibly extend the analysis to produce a known + // lower-bound and compare against that rather than always using 128 as the + // pivot point.) 
// ch >= 128 && RegexRunner.CharInClass(ch, "...") Ldloc(tempLocal); @@ -4866,7 +5025,7 @@ void EmitCharInClass() // ch < 128 ? (bitVectorString[ch >> 4] & (1 << (ch & 0xF))) != 0 : Ldloc(tempLocal); - Ldc(analysis.ContainsOnlyAscii ? analysis.UpperBoundExclusiveIfContainsOnlyAscii : 128); + Ldc(analysis.ContainsOnlyAscii ? analysis.UpperBoundExclusiveIfOnlyRanges : 128); Bge(comparisonLabel); Ldstr(bitVectorString); Ldloc(tempLocal); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 132d7457d71123..82a9071a71d506 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -885,6 +885,20 @@ private RegexNode ReduceSet() RegexNodeKind.Notonelazy; } + // Normalize some well-known sets + switch (Str) + { + // Different ways of saying "match anything" + case RegexCharClass.WordNotWordClass: + case RegexCharClass.NotWordWordClass: + case RegexCharClass.DigitNotDigitClass: + case RegexCharClass.NotDigitDigitClass: + case RegexCharClass.SpaceNotSpaceClass: + case RegexCharClass.NotSpaceSpaceClass: + Str = RegexCharClass.AnyClass; + break; + } + return this; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs index 84f0e5f9d78919..9db13f6a0e37a3 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexCharacterSetTests.cs @@ -19,6 +19,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() { yield return new object[] { engine, @"a", RegexOptions.IgnoreCase, new[] { 'a', 'A' } }; yield return 
new object[] { engine, @"ac", RegexOptions.None, new[] { 'a', 'c' } }; + yield return new object[] { engine, @"\u00E5\u00C5\u212B", RegexOptions.None, new[] { '\u00E5', '\u00C5', '\u212B' } }; yield return new object[] { engine, @"ace", RegexOptions.None, new[] { 'a', 'c', 'e' } }; yield return new object[] { engine, @"aceg", RegexOptions.None, new[] { 'a', 'c', 'e', 'g' } }; yield return new object[] { engine, @"aceg", RegexOptions.IgnoreCase, new[] { 'a', 'A', 'c', 'C', 'e', 'E', 'g', 'G' } }; @@ -37,6 +38,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() yield return new object[] { engine, @"a ", RegexOptions.None, new[] { 'a', ' ' } }; yield return new object[] { engine, @"a \t\r", RegexOptions.None, new[] { 'a', ' ', '\t', '\r' } }; yield return new object[] { engine, @"aeiou", RegexOptions.None, new[] { 'a', 'e', 'i', 'o', 'u' } }; + yield return new object[] { engine, @"\u0000aeiou\u00FF", RegexOptions.None, new[] { '\u0000', 'a', 'e', 'i', 'o', 'u', '\u00FF' } }; yield return new object[] { engine, @"a-a", RegexOptions.None, new[] { 'a' } }; yield return new object[] { engine, @"ab", RegexOptions.None, new[] { 'a', 'b' } }; yield return new object[] { engine, @"a-b", RegexOptions.None, new[] { 'a', 'b' } }; @@ -45,6 +47,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() yield return new object[] { engine, @"ACEGIKMOQSUWY", RegexOptions.None, new[] { 'A', 'C', 'E', 'G', 'I', 'K', 'M', 'O', 'Q', 'S', 'U', 'W', 'Y' } }; yield return new object[] { engine, @"abcAB", RegexOptions.None, new[] { 'A', 'B', 'a', 'b', 'c' } }; yield return new object[] { engine, @"a-c", RegexOptions.None, new[] { 'a', 'b', 'c' } }; + yield return new object[] { engine, @"a-fA-F", RegexOptions.None, new[] { 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F' } }; yield return new object[] { engine, @"a-fA-F0-9", RegexOptions.None, new[] { 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' } 
}; yield return new object[] { engine, @"X-b", RegexOptions.None, new[] { 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b' } }; yield return new object[] { engine, @"\u0083\u00DE-\u00E1", RegexOptions.None, new[] { '\u0083', '\u00DE', '\u00DF', '\u00E0', '\u00E1' } }; @@ -55,6 +58,7 @@ public static IEnumerable SetInclusionsExpected_MemberData() yield return new object[] { engine, @"[a-z-[d-w-[m-o]]]", RegexOptions.None, new[] { 'a', 'b', 'c', 'm', 'n', 'n', 'o', 'x', 'y', 'z' } }; yield return new object[] { engine, @"\p{IsBasicLatin}-[\x00-\x7F]", RegexOptions.None, new char[0] }; yield return new object[] { engine, @"[0-9-[2468]]", RegexOptions.None, new[] { '0', '1', '3', '5', '7', '9' } }; + yield return new object[] { engine, @"[\u1000-\u1001\u3000-\u3002\u5000-\u5003]", RegexOptions.None, new[] { '\u1000', '\u1001', '\u3000', '\u3001', '\u3002', '\u5000', '\u5001', '\u5002', '\u5003' } }; } } @@ -69,8 +73,8 @@ public async Task SetInclusionsExpected(RegexEngine engine, string set, RegexOpt } else { - await ValidateSetAsync(engine, $"[{set}]", options, new HashSet(expectedIncluded), null); - await ValidateSetAsync(engine, $"[^{set}]", options, null, new HashSet(expectedIncluded)); + await ValidateSetAsync(engine, $"[{set}]", options, new HashSet(expectedIncluded), null, validateEveryChar: true); + await ValidateSetAsync(engine, $"[^{set}]", options, null, new HashSet(expectedIncluded), validateEveryChar: true); } } @@ -146,11 +150,19 @@ public async Task AllEmptySets(RegexEngine engine) await ValidateSetAsync(engine, @"[\u0000-\uFFFFa-z]", RegexOptions.None, null, set); await ValidateSetAsync(engine, @"[\u0000-\u1000\u1001-\u2002\u2003-\uFFFF]", RegexOptions.None, null, set); await ValidateSetAsync(engine, @"[\u0000-\uFFFE\u0001-\uFFFF]", RegexOptions.None, null, set, validateEveryChar: true); + foreach (string all in new[] { @"[\d\D]", @"[\D\d]", @"[\w\W]", @"[\W\w]", @"[\s\S]", @"[\S\s]", }) + { + await ValidateSetAsync(engine, all, RegexOptions.None, 
null, new HashSet(), validateEveryChar: true); + } await ValidateSetAsync(engine, @"[^\u0000-\uFFFF]", RegexOptions.None, set, null); await ValidateSetAsync(engine, @"[^\u0000-\uFFFFa-z]", RegexOptions.None, set, null); await ValidateSetAsync(engine, @"[^\u0000-\uFFFE\u0001-\uFFFF]", RegexOptions.None, set, null); await ValidateSetAsync(engine, @"[^\u0000-\u1000\u1001-\u2002\u2003-\uFFFF]", RegexOptions.None, set, null, validateEveryChar: true); + foreach (string empty in new[] { @"[^\d\D]", @"[^\D\d]", @"[^\w\W]", @"[^\W\w]", @"[^\s\S]", @"[^\S\s]", }) + { + await ValidateSetAsync(engine, empty, RegexOptions.None, set, null, validateEveryChar: true); + } } [Theory]