Skip to content

Commit

Permalink
Fix incorrect handling of character range and capitalization in regex (
Browse files Browse the repository at this point in the history
…#42282)

* Bug fix and unit test

* Address comments

* An elegant fix and unit tests

* Remove old code

* Fix the unit tests

* sq

* sq

* Address comments and add test case

* Skip the unit tests on net framework

* Debug commit for CI

* Try to get more error info

* Run validation as a unit test.

Skip it on non-ICU environments
  • Loading branch information
Prashanth Govindarajan authored Nov 2, 2020
1 parent c3c768b commit 7c9c347
Show file tree
Hide file tree
Showing 6 changed files with 253 additions and 164 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<ItemGroup>
<Compile Include="System\Collections\HashtableExtensions.cs" />
<Compile Include="System\Collections\Generic\ValueListBuilder.Pop.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCharClass.MappingTable.cs" />
<Compile Include="System\Text\SegmentStringBuilder.cs" />
<Compile Include="System\Text\RegularExpressions\Capture.cs" />
<Compile Include="System\Text\RegularExpressions\CaptureCollection.cs" />
Expand Down Expand Up @@ -46,14 +47,10 @@
<Compile Include="System\Text\RegularExpressions\RegexCompiler.cs" />
<Compile Include="System\Text\RegularExpressions\RegexLWCGCompiler.cs" />
<!-- Common or Common-branched source files -->
<Compile Include="$(CommonPath)System\NotImplemented.cs"
Link="Common\System\NotImplemented.cs" />
<Compile Include="$(CommonPath)System\HexConverter.cs"
Link="Common\System\HexConverter.cs" />
<Compile Include="$(CoreLibSharedDir)System\Collections\Generic\ValueListBuilder.cs"
Link="Common\System\Collections\Generic\ValueListBuilder.cs" />
<Compile Include="$(CommonPath)System\Text\ValueStringBuilder.cs"
Link="Common\System\Text\ValueStringBuilder.cs" />
<Compile Include="$(CommonPath)System\NotImplemented.cs" Link="Common\System\NotImplemented.cs" />
<Compile Include="$(CommonPath)System\HexConverter.cs" Link="Common\System\HexConverter.cs" />
<Compile Include="$(CoreLibSharedDir)System\Collections\Generic\ValueListBuilder.cs" Link="Common\System\Collections\Generic\ValueListBuilder.cs" />
<Compile Include="$(CommonPath)System\Text\ValueStringBuilder.cs" Link="Common\System\Text\ValueStringBuilder.cs" />
</ItemGroup>
<ItemGroup>
<Reference Include="System.Collections" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;

namespace System.Text.RegularExpressions
{
internal sealed partial class RegexCharClass
{
/**************************************************************************
Let U be the set of Unicode character values and let L be the lowercase
function, mapping from U to U. To perform case insensitive matching of
character sets, we need to be able to map an interval I in U, say
I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
to a set A such that A contains L(I) and A is contained in the union of
I and L(I).
The table below lists disjoint intervals of U, in ascending order; on each of
them L is non-decreasing.
Thus, for any interval J = [a, b] contained in one of these intervals,
L(J) is contained in [L(a), L(b)].
It is also true that for any such J, [L(a), L(b)] is contained in the
union of J and L(J). This does not follow merely from L being non-decreasing
on these intervals; it follows from the specific form L takes on each interval.
On each interval, L has one of the following forms:
(1) L(ch) = constant (LowercaseSet)
(2) L(ch) = ch + offset (LowercaseAdd)
(3) L(ch) = ch | 1 (LowercaseBor)
(4) L(ch) = ch + (ch & 1) (LowercaseBad)
It is easy to verify that for any of these forms [L(a), L(b)] is
contained in the union of [a, b] and L([a, b]).
***************************************************************************/

// Operation codes stored in LowerCaseMapping.LcOp. Each code selects the form that the
// lowercase function L takes on the inclusive range [ChMin, ChMax] of a table entry.

/// <summary>L(ch) = Data: every character in the range lowercases to the single value held in Data.</summary>
internal const int LowercaseSet = 0; // Set to arg.
/// <summary>L(ch) = ch + Data: Data is a signed offset (negative in some entries) added to the character.</summary>
internal const int LowercaseAdd = 1; // Add arg.
/// <summary>L(ch) = ch | 1: even values map to the next odd value, odd values map to themselves. Data is 0 in every such entry below.</summary>
internal const int LowercaseBor = 2; // Bitwise or with 1.
/// <summary>L(ch) = ch + (ch &amp; 1): odd values map to the next even value, even values map to themselves. Data is 0 in every such entry below.</summary>
internal const int LowercaseBad = 3; // Bitwise and with 1 and add original.

/// <summary>
/// Lowercase mapping table. Each entry is (ChMin, ChMax, LcOp, Data): an inclusive character
/// range, the operation code that lowercases characters in that range, and the argument
/// of that operation (a target value for LowercaseSet, an offset for LowercaseAdd, 0 otherwise).
/// </summary>
/// <remarks>
/// Entries are listed in ascending order of ChMin and their ranges do not overlap.
/// NOTE(review): the code that searches this table is not in this file; it presumably
/// depends on this ordering - confirm before inserting or reordering entries.
/// </remarks>
internal static readonly LowerCaseMapping[] s_lcTable = new LowerCaseMapping[]
{
new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32),
new LowerCaseMapping('\u00C0', '\u00D6', LowercaseAdd, 32),
new LowerCaseMapping('\u00D8', '\u00DE', LowercaseAdd, 32),
new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0),
new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0),
new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0),
new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0),
new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF),
new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0),
new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253),
new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0),
new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254),
new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188),
new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205),
new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C),
new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD),
new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259),
new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B),
new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 0x0192),
new LowerCaseMapping('\u0193', '\u0193', LowercaseSet, 0x0260),
new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263),
new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269),
new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268),
new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199),
new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F),
new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272),
new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275),
new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0),
new LowerCaseMapping('\u01A7', '\u01A7', LowercaseSet, 0x01A8),
new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283),
new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD),
new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288),
new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0),
new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217),
new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0),
new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292),
new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9),
new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD),
new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6),
new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9),
new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC),
new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0),
new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0),
new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3),
new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5),
new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0),
new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC),
new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37),
new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC),
new LowerCaseMapping('\u038E', '\u038F', LowercaseAdd, 63),
new LowerCaseMapping('\u0391', '\u03A1', LowercaseAdd, 32),
new LowerCaseMapping('\u03A3', '\u03AB', LowercaseAdd, 32),
new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0),
new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80),
new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32),
new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0),
new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0),
new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0),
new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8),
new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC),
new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0),
new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0),
new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9),
new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48),
new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 7264),
new LowerCaseMapping('\u1E00', '\u1E95', LowercaseBor, 0),
new LowerCaseMapping('\u1EA0', '\u1EF8', LowercaseBor, 0),
new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F18', '\u1F1D', LowercaseAdd, -8),
new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8),
new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51),
new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53),
new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55),
new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57),
new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8),
new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8),
new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8),
new LowerCaseMapping('\u1FBA', '\u1FBB', LowercaseAdd, -74),
new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3),
new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86),
new LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3),
new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8),
new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100),
new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8),
new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112),
new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5),
new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128),
new LowerCaseMapping('\u1FFA', '\u1FFB', LowercaseAdd, -126),
new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3),
new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16),
new LowerCaseMapping('\u24B6', '\u24CF', LowercaseAdd, 26),
new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32),
};

/// <summary>
/// One row of the lowercase table: an inclusive character range together with the
/// operation code and argument used to lowercase the characters in that range.
/// </summary>
internal readonly struct LowerCaseMapping
{
    /// <summary>First character of the range (inclusive).</summary>
    public readonly char ChMin;

    /// <summary>Last character of the range (inclusive).</summary>
    public readonly char ChMax;

    /// <summary>Operation code; the table populates it with one of the Lowercase* constants.</summary>
    public readonly int LcOp;

    /// <summary>Argument of the operation: a target value or an offset, depending on the operation code.</summary>
    public readonly int Data;

    /// <summary>Creates a descriptor, storing each argument in the field of the same name.</summary>
    /// <param name="chMin">First character of the range.</param>
    /// <param name="chMax">Last character of the range.</param>
    /// <param name="lcOp">Operation code.</param>
    /// <param name="data">Argument of the operation.</param>
    internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data)
        => (ChMin, ChMax, LcOp, Data) = (chMin, chMax, lcOp, data);
}
}
}
Loading

0 comments on commit 7c9c347

Please sign in to comment.