diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 5313bf0d948138..77351ca146f7ae 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -7,6 +7,7 @@ + @@ -46,14 +47,10 @@ - - - - + + + + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.MappingTable.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.MappingTable.cs new file mode 100644 index 00000000000000..b299a3b8e1e8b9 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.MappingTable.cs @@ -0,0 +1,162 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; + +namespace System.Text.RegularExpressions +{ + internal sealed partial class RegexCharClass + { + /************************************************************************** + Let U be the set of Unicode character values and let L be the lowercase + function, mapping from U to U. To perform case insensitive matching of + character sets, we need to be able to map an interval I in U, say + + I = [chMin, chMax] = { ch : chMin <= ch <= chMax } + + to a set A such that A contains L(I) and A is contained in the union of + I and L(I). + + The table below partitions U into intervals on which L is non-decreasing. + Thus, for any interval J = [a, b] contained in one of these intervals, + L(J) is contained in [L(a), L(b)]. + + It is also true that for any such J, [L(a), L(b)] is contained in the + union of J and L(J). This does not follow from L being non-decreasing on + these intervals. 
It follows from the nature of the L on each interval. + On each interval, L has one of the following forms: + + (1) L(ch) = constant (LowercaseSet) + (2) L(ch) = ch + offset (LowercaseAdd) + (3) L(ch) = ch | 1 (LowercaseBor) + (4) L(ch) = ch + (ch & 1) (LowercaseBad) + + It is easy to verify that for any of these forms [L(a), L(b)] is + contained in the union of [a, b] and L([a, b]). + ***************************************************************************/ + + internal const int LowercaseSet = 0; // Set to arg. + internal const int LowercaseAdd = 1; // Add arg. + internal const int LowercaseBor = 2; // Bitwise or with 1. + internal const int LowercaseBad = 3; // Bitwise and with 1 and add original. + + internal static readonly LowerCaseMapping[] s_lcTable = new LowerCaseMapping[] + { + new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32), + new LowerCaseMapping('\u00C0', '\u00D6', LowercaseAdd, 32), + new LowerCaseMapping('\u00D8', '\u00DE', LowercaseAdd, 32), + new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0), + new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0), + new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0), + new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0), + new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF), + new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0), + new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253), + new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0), + new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254), + new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188), + new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205), + new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C), + new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD), + new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259), + new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B), + new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 
0x0192), + new LowerCaseMapping('\u0193', '\u0193', LowercaseSet, 0x0260), + new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263), + new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269), + new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268), + new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199), + new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F), + new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272), + new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275), + new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0), + new LowerCaseMapping('\u01A7', '\u01A7', LowercaseSet, 0x01A8), + new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283), + new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD), + new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288), + new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0), + new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217), + new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0), + new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292), + new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9), + new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD), + new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6), + new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9), + new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC), + new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0), + new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0), + new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3), + new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5), + new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0), + new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC), + new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37), + new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC), + new LowerCaseMapping('\u038E', '\u038F', 
LowercaseAdd, 63), + new LowerCaseMapping('\u0391', '\u03A1', LowercaseAdd, 32), + new LowerCaseMapping('\u03A3', '\u03AB', LowercaseAdd, 32), + new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0), + new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80), + new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32), + new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0), + new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0), + new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0), + new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8), + new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC), + new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0), + new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0), + new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9), + new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48), + new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 7264), + new LowerCaseMapping('\u1E00', '\u1E95', LowercaseBor, 0), + new LowerCaseMapping('\u1EA0', '\u1EF8', LowercaseBor, 0), + new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8), + new LowerCaseMapping('\u1F18', '\u1F1D', LowercaseAdd, -8), + new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8), + new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8), + new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8), + new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51), + new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53), + new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55), + new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57), + new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8), + new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8), + new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8), + new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8), + new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8), + new LowerCaseMapping('\u1FBA', 
'\u1FBB', LowercaseAdd, -74), + new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3), + new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86), + new LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3), + new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8), + new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100), + new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8), + new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112), + new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5), + new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128), + new LowerCaseMapping('\u1FFA', '\u1FFB', LowercaseAdd, -126), + new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3), + new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16), + new LowerCaseMapping('\u24B6', '\u24CF', LowercaseAdd, 26), + new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32), + }; + + /// + /// Lower case mapping descriptor. + /// + internal readonly struct LowerCaseMapping + { + public readonly char ChMin; + public readonly char ChMax; + public readonly int LcOp; + public readonly int Data; + + internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data) + { + ChMin = chMin; + ChMax = chMax; + LcOp = lcOp; + Data = data; + } + } + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index d4022f673e4ff7..3dc776fe604711 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -25,7 +25,7 @@ namespace System.Text.RegularExpressions // included in this class. /// Provides the "set of Unicode chars" functionality used by the regexp engine. 
- internal sealed class RegexCharClass + internal sealed partial class RegexCharClass { // Constants internal const int FlagsIndex = 0; @@ -264,137 +264,6 @@ internal sealed class RegexCharClass +"\u3041\u3097\u3099\u30A0\u30A1\u30FB\u30FC\u3100\u3105\u312D\u3131\u318F\u3190\u31B8\u31F0\u321D\u3220\u3244\u3251\u327C\u327F\u32CC\u32D0\u32FF\u3300\u3377\u337B\u33DE\u33E0\u33FF\u3400\u4DB6\u4E00\u9FA6\uA000\uA48D\uA490\uA4C7\uAC00\uD7A4\uF900\uFA2E\uFA30\uFA6B\uFB00\uFB07\uFB13\uFB18\uFB1D\uFB37\uFB38\uFB3D\uFB3E\uFB3F\uFB40\uFB42\uFB43\uFB45\uFB46\uFBB2\uFBD3\uFD3E\uFD50\uFD90\uFD92\uFDC8\uFDF0\uFDFD\uFE00\uFE10\uFE20\uFE24\uFE62\uFE63\uFE64\uFE67\uFE69\uFE6A\uFE70\uFE75\uFE76\uFEFD\uFF04\uFF05\uFF0B\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF21\uFF3B\uFF3E\uFF3F\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF66\uFFBF\uFFC2\uFFC8\uFFCA\uFFD0\uFFD2\uFFD8\uFFDA\uFFDD\uFFE0\uFFE7\uFFE8\uFFEF\uFFFC\uFFFE"}, }; - /************************************************************************** - Let U be the set of Unicode character values and let L be the lowercase - function, mapping from U to U. To perform case insensitive matching of - character sets, we need to be able to map an interval I in U, say - - I = [chMin, chMax] = { ch : chMin <= ch <= chMax } - - to a set A such that A contains L(I) and A is contained in the union of - I and L(I). - - The table below partitions U into intervals on which L is non-decreasing. - Thus, for any interval J = [a, b] contained in one of these intervals, - L(J) is contained in [L(a), L(b)]. - - It is also true that for any such J, [L(a), L(b)] is contained in the - union of J and L(J). This does not follow from L being non-decreasing on - these intervals. It follows from the nature of the L on each interval. 
- On each interval, L has one of the following forms: - - (1) L(ch) = constant (LowercaseSet) - (2) L(ch) = ch + offset (LowercaseAdd) - (3) L(ch) = ch | 1 (LowercaseBor) - (4) L(ch) = ch + (ch & 1) (LowercaseBad) - - It is easy to verify that for any of these forms [L(a), L(b)] is - contained in the union of [a, b] and L([a, b]). - ***************************************************************************/ - - private const int LowercaseSet = 0; // Set to arg. - private const int LowercaseAdd = 1; // Add arg. - private const int LowercaseBor = 2; // Bitwise or with 1. - private const int LowercaseBad = 3; // Bitwise and with 1 and add original. - - private static readonly LowerCaseMapping[] s_lcTable = new LowerCaseMapping[] - { - new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32), - new LowerCaseMapping('\u00C0', '\u00DE', LowercaseAdd, 32), - new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0), - new LowerCaseMapping('\u0130', '\u0130', LowercaseSet, 0x0069), - new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0), - new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0), - new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0), - new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF), - new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0), - new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253), - new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0), - new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254), - new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188), - new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205), - new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C), - new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD), - new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259), - new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B), - new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 0x0192), - new LowerCaseMapping('\u0193', '\u0193', 
LowercaseSet, 0x0260), - new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263), - new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269), - new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268), - new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199), - new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F), - new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272), - new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275), - new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0), - new LowerCaseMapping('\u01A7', '\u01A7', LowercaseSet, 0x01A8), - new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283), - new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD), - new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288), - new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0), - new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217), - new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0), - new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292), - new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9), - new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD), - new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6), - new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9), - new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC), - new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0), - new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0), - new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3), - new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5), - new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0), - new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC), - new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37), - new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC), - new LowerCaseMapping('\u038E', '\u038F', LowercaseAdd, 63), - new LowerCaseMapping('\u0391', 
'\u03AB', LowercaseAdd, 32), - new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0), - new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80), - new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32), - new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0), - new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0), - new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0), - new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8), - new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC), - new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0), - new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0), - new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9), - new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48), - new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 48), - new LowerCaseMapping('\u1E00', '\u1EF8', LowercaseBor, 0), - new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8), - new LowerCaseMapping('\u1F18', '\u1F1F', LowercaseAdd, -8), - new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8), - new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8), - new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8), - new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51), - new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53), - new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55), - new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57), - new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8), - new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8), - new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8), - new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8), - new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8), - new LowerCaseMapping('\u1FBA', '\u1FBB', LowercaseAdd, -74), - new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3), - new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86), - new 
LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3), - new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8), - new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100), - new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8), - new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112), - new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5), - new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128), - new LowerCaseMapping('\u1FFA', '\u1FFB', LowercaseAdd, -126), - new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3), - new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16), - new LowerCaseMapping('\u24B6', '\u24D0', LowercaseAdd, 26), - new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32), - }; - private List? _rangelist; private StringBuilder? _categories; private RegexCharClass? _subtractor; @@ -412,6 +281,7 @@ static RegexCharClass() int len = s_propTable.Length; for (int i = 0; i < len - 1; i++) Debug.Assert(string.Compare(s_propTable[i][0], s_propTable[i + 1][0], StringComparison.Ordinal) < 0, $"RegexCharClass s_propTable is out of order at ({s_propTable[i][0]}, {s_propTable[i + 1][0]})"); + } #endif @@ -569,7 +439,7 @@ private void AddLowercaseRange(char chMin, char chMax) { int i = 0; - for (int iMax = s_lcTable.Length; i < iMax; ) + for (int iMax = s_lcTable.Length; i < iMax;) { int iMid = (i + iMax) >> 1; if (s_lcTable[iMid].ChMax < chMin) @@ -1692,25 +1562,6 @@ private static string CategoryDescription(char ch) } #endif - /// - /// Lower case mapping descriptor. - /// - private readonly struct LowerCaseMapping - { - public readonly char ChMin; - public readonly char ChMax; - public readonly int LcOp; - public readonly int Data; - - internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data) - { - ChMin = chMin; - ChMax = chMax; - LcOp = lcOp; - Data = data; - } - } - /// /// A first/last pair representing a single range of characters. 
/// diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index a7e73699f36e8e..a8b319ad27deeb 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -378,6 +378,32 @@ public static IEnumerable Match_Basic_TestData() } } + public static IEnumerable Match_Basic_TestData_NetCore() + { + // Ranges whose s_lcTable entries were corrected. Negative cases use the character the previous offset wrongly produced for a code point in the range; positive cases use the correctly mapped lowercase character. + yield return new object[] { @"^(?i:[\u00D7-\u00D8])$", '\u00F7'.ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, false, "" }; + yield return new object[] { @"^(?i:[\u00C0-\u00DE])$", '\u00F7'.ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, false, "" }; + yield return new object[] { @"^(?i:[\u00C0-\u00DE])$", ((char)('\u00C0' + 32)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u00C0' + 32)).ToString() }; + yield return new object[] { @"^(?i:[\u00C0-\u00DE])$", ((char)('\u00DE' + 32)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u00DE' + 32)).ToString() }; + yield return new object[] { @"^(?i:[\u0391-\u03AB])$", ((char)('\u03A2' + 32)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, false, "" }; + yield return new object[] { @"^(?i:[\u0391-\u03AB])$", ((char)('\u0391' + 32)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u0391' + 32)).ToString() }; + yield return new object[] { @"^(?i:[\u0391-\u03AB])$", ((char)('\u03AB' + 32)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u03AB' + 32)).ToString() }; + yield return new object[] { @"^(?i:[\u1F18-\u1F1F])$", ((char)('\u1F1F' - 
8)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, false, "" }; + yield return new object[] { @"^(?i:[\u1F18-\u1F1F])$", ((char)('\u1F18' - 8)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u1F18' - 8)).ToString() }; + yield return new object[] { @"^(?i:[\u10A0-\u10C5])$", ((char)('\u10A0' + 7264)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u10A0' + 7264)).ToString() }; + yield return new object[] { @"^(?i:[\u10A0-\u10C5])$", ((char)('\u10A0' + 48)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, false, "" }; + yield return new object[] { @"^(?i:[\u24B6-\u24D0])$", ((char)('\u24D0' + 26)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, false, "" }; + yield return new object[] { @"^(?i:[\u24B6-\u24D0])$", ((char)('\u24CF' + 26)).ToString(), RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, 0, 1, true, ((char)('\u24CF' + 26)).ToString() }; + } + + [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework)] + [Theory] + [MemberData(nameof(Match_Basic_TestData_NetCore))] + public void Match_NetCore(string pattern, string input, RegexOptions options, int beginning, int length, bool expectedSuccess, string expectedValue) + { + Match(pattern, input, options, beginning, length, expectedSuccess, expectedValue); + } + [Theory] [MemberData(nameof(Match_Basic_TestData))] [MemberData(nameof(RegexCompilationHelper.TransformRegexOptions), nameof(Match_Basic_TestData), 2, MemberType = typeof(RegexCompilationHelper))] diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexValidations.netcoreapp.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexValidations.netcoreapp.cs new file mode 100644 index 00000000000000..9f4cf807d5c9db --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexValidations.netcoreapp.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET 
Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Globalization; +using Xunit; + +namespace System.Text.RegularExpressions.Tests +{ + public class RegexValidations + { + [ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] + public void ValidateLowercaseMapTableInRegexCharClass() + { + CultureInfo culture = CultureInfo.InvariantCulture; // every s_lcTable entry is checked against invariant-culture ToLower + for (int k = 0; k < RegexCharClass.s_lcTable.Length; k++) + { + RegexCharClass.LowerCaseMapping loc = RegexCharClass.s_lcTable[k]; + if (loc.LcOp == RegexCharClass.LowercaseAdd) + { + int offset = loc.Data; + for (char l = loc.ChMin; l <= loc.ChMax; l++) // char counter wraps at char.MaxValue; terminates only while no entry has ChMax == '\uFFFF' + { + Assert.True(culture.TextInfo.ToLower((char)l) == (char)(l + offset), $"The Unicode character range at index {k} in s_lcTable contains the character {(char)l} (decimal value: {(int)l}). Its lowercase value cannot be obtained by using the specified offset."); + } + } + else if (loc.LcOp == RegexCharClass.LowercaseSet) + { + char lowercase = (char)loc.Data; + for (char l = loc.ChMin; l <= loc.ChMax; l++) + { + char uppercase = l; + Assert.True(culture.TextInfo.ToLower(uppercase) == lowercase, $"The Unicode character range at index {k} in s_lcTable contains the character {uppercase} (decimal value: {(int)uppercase}, hex: {(int)uppercase:X}). Its lowercase value {culture.TextInfo.ToLower(uppercase).ToString()} (decimal value: {(int)culture.TextInfo.ToLower(uppercase)}, hex: {(int)culture.TextInfo.ToLower(uppercase):X}) is not the stored value {lowercase} (decimal value: {(int)lowercase}, hex: {(int)lowercase:X})."); + } + } + else if (loc.LcOp == RegexCharClass.LowercaseBor) + { + for (char l = loc.ChMin; l <= loc.ChMax; l++) + { + Assert.True(culture.TextInfo.ToLower((char)l) == (char)(l | (char)1), $"The Unicode character range at index {k} in s_lcTable contains the character {(char)l} (decimal value: {(int)l}). 
Its lowercase value {culture.TextInfo.ToLower(l)} cannot be obtained by OR-ing with 1: {(char)(l | (char)1)}"); + } + } + else if (loc.LcOp == RegexCharClass.LowercaseBad) + { + for (char l = loc.ChMin; l <= loc.ChMax; l++) + { + Assert.True(culture.TextInfo.ToLower((char)l) == (char)(l + (l & 1)), $"The Unicode character range at index {k} in s_lcTable contains the character {(char)l} (decimal value: {(int)l}). Its lowercase value cannot be obtained by AND-ing with 1."); + } + } + } + } + } +} diff --git a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj index fade03bcb99e88..45fe74f6287690 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj @@ -32,12 +32,12 @@ - + + @@ -45,7 +45,7 @@ - + +