-
Notifications
You must be signed in to change notification settings - Fork 4.1k
/
Copy pathCaseInsensitiveComparison.cs
402 lines (344 loc) · 14.1 KB
/
CaseInsensitiveComparison.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Text;
using Microsoft.CodeAnalysis.PooledObjects;
using Roslyn.Utilities;
namespace Microsoft.CodeAnalysis
{
/// <summary>
/// Case-insensitive operations (mostly comparison) on Unicode strings.
/// </summary>
#if COMPILERCORE
public
#else
internal
#endif
static class CaseInsensitiveComparison
{
// PERF: The TextInfo used for Unicode lower-casing is resolved once and cached,
// because it is consulted very frequently.
private static readonly TextInfo s_unicodeCultureTextInfo = GetUnicodeCulture().TextInfo;

/// <summary>
/// Selects the culture whose <see cref="TextInfo"/> supplies the lower-case mapping.
/// </summary>
/// <returns>
/// The "en" culture when it can be created; otherwise <see cref="CultureInfo.InvariantCulture"/>.
/// </returns>
private static CultureInfo GetUnicodeCulture()
{
    CultureInfo culture;

    try
    {
        // "en" is preferred for the Unicode ToLower mapping: it implements a much more
        // recent Unicode version (6.0+) than the invariant culture (1.0), and it matches
        // the Unicode version used for character categorization.
        culture = new CultureInfo("en");
    }
    catch (ArgumentException)
    {
        // ArgumentException is caught because System.Globalization.CultureNotFoundException
        // is not available on all platforms.
        //
        // Without "en", settle for the invariant culture. It has its own bugs (e.g. being
        // version-locked to Unicode 1.0), but it is present on every platform.
        culture = CultureInfo.InvariantCulture;
    }

    return culture;
}
/// <summary>
/// Maps a single character to lower case using the Unicode lowercase mapping
/// described in ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt.
/// VB uses these mappings for case-insensitive comparison.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>
/// The Unicode lower case equivalent of <paramref name="c"/> when it is upper case;
/// otherwise <paramref name="c"/> unmodified.
/// </returns>
public static char ToLower(char c)
{
    // PERF: This is a very hot code path in VB, so ASCII is handled first.
    // Subtracting 'A' and reinterpreting the result as unsigned makes every value below 'A'
    // wrap around to a huge number, so a single comparison tests 'A' <= c <= 'Z'.
    uint distanceFromA = unchecked((uint)(c - 'A'));
    if (distanceFromA <= (uint)('Z' - 'A'))
    {
        // Setting bit 0x20 turns an ASCII upper-case letter into its lower-case form.
        return (char)(c | 0x20);
    }

    // Anything else below U+00C0 (Latin Capital Letter A with Grave, the next upper-case
    // code point) is returned as-is; this covers the rest of ASCII (U+0000 - U+007F).
    return c < 0xC0 ? c : ToLowerNonAscii(c);
}
/// <summary>
/// Lower-cases a character through the cached culture <see cref="TextInfo"/>,
/// with one hard-coded exception for U+0130.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>'i' for U+0130; otherwise the result of the cached <see cref="TextInfo.ToLower(char)"/>.</returns>
private static char ToLowerNonAscii(char c)
{
    // Special case: Turkish I (LATIN CAPITAL LETTER I WITH DOT ABOVE) always maps to 'i'.
    // This corrects for the fact that the invariant culture only supports Unicode 1.0
    // and therefore does not "know about" this character.
    return c == '\u0130'
        ? 'i'
        : s_unicodeCultureTextInfo.ToLower(c);
}
/// <summary>
/// A string comparer that compares, tests equality and hashes character by character,
/// using the one-to-one Unicode lowercase mapping.
/// </summary>
private sealed class OneToOneUnicodeComparer : StringComparer
{
/// <summary>
/// Compares two characters by their lower-cased values.
/// </summary>
/// <returns>0 when the characters are equal ignoring case; otherwise the difference of their lower-cased values.</returns>
private static int CompareLowerUnicode(char c1, char c2)
{
    if (c1 == c2)
    {
        // Identical characters need no case mapping.
        return 0;
    }

    return ToLower(c1) - ToLower(c2);
}
/// <summary>
/// Orders two strings by comparing their characters pairwise after lower-casing.
/// </summary>
/// <param name="str1">First string; may be null.</param>
/// <param name="str2">Second string; may be null.</param>
/// <returns>
/// 0 when both arguments are the same reference (including both null); -1 when only
/// <paramref name="str1"/> is null; 1 when only <paramref name="str2"/> is null. Otherwise the
/// difference of the first pair of characters that differ after lower-casing, or, when one
/// string is a prefix of the other, the difference of the lengths.
/// </returns>
public override int Compare(string? str1, string? str2)
{
    if (ReferenceEquals(str1, str2))
    {
        return 0;
    }

    if (str1 is null)
    {
        return -1;
    }

    if (str2 is null)
    {
        return 1;
    }

    int sharedLength = Math.Min(str1.Length, str2.Length);
    int index = 0;
    while (index < sharedLength)
    {
        int difference = CompareLowerUnicode(str1[index], str2[index]);
        if (difference != 0)
        {
            return difference;
        }

        index++;
    }

    // The common prefix matched: the shorter string sorts first, equal lengths give 0.
    return str1.Length - str2.Length;
}
/// <summary>
/// Orders two character spans by comparing their characters pairwise after lower-casing.
/// </summary>
/// <param name="str1">First span.</param>
/// <param name="str2">Second span.</param>
/// <returns>
/// The difference of the first pair of characters that differ after lower-casing, or, when one
/// span is a prefix of the other, the difference of the lengths (0 when they are equal).
/// </returns>
public int Compare(ReadOnlySpan<char> str1, ReadOnlySpan<char> str2)
{
    int sharedLength = Math.Min(str1.Length, str2.Length);
    int index = 0;
    while (index < sharedLength)
    {
        int difference = CompareLowerUnicode(str1[index], str2[index]);
        if (difference != 0)
        {
            return difference;
        }

        index++;
    }

    // The common prefix matched: the shorter span sorts first, equal lengths give 0.
    return str1.Length - str2.Length;
}
/// <summary>
/// Tells whether two characters are the same once lower-cased.
/// </summary>
private static bool AreEqualLowerUnicode(char c1, char c2)
{
    if (c1 == c2)
    {
        // Identical characters need no case mapping.
        return true;
    }

    return ToLower(c1) == ToLower(c2);
}
/// <summary>
/// Tests two strings for equality, ignoring case via the lowercase mapping.
/// </summary>
/// <param name="str1">First string; may be null.</param>
/// <param name="str2">Second string; may be null.</param>
/// <returns>
/// true when both arguments are the same reference (including both null), or when they have
/// the same length and every pair of characters matches after lower-casing; otherwise false.
/// </returns>
public override bool Equals(string? str1, string? str2)
{
    if (ReferenceEquals(str1, str2))
    {
        return true;
    }

    // Exactly one null argument, or differing lengths, can never be equal.
    if (str1 is null || str2 is null || str1.Length != str2.Length)
    {
        return false;
    }

    int index = 0;
    while (index < str1.Length)
    {
        if (!AreEqualLowerUnicode(str1[index], str2[index]))
        {
            return false;
        }

        index++;
    }

    return true;
}
/// <summary>
/// Tests two character spans for equality, ignoring case via the lowercase mapping.
/// </summary>
/// <param name="str1">First span.</param>
/// <param name="str2">Second span.</param>
/// <returns>
/// true when the spans have the same length and every pair of characters matches after
/// lower-casing; otherwise false.
/// </returns>
public bool Equals(ReadOnlySpan<char> str1, ReadOnlySpan<char> str2)
{
    int length = str1.Length;
    if (length != str2.Length)
    {
        return false;
    }

    int index = 0;
    while (index < length)
    {
        if (!AreEqualLowerUnicode(str1[index], str2[index]))
        {
            return false;
        }

        index++;
    }

    return true;
}
/// <summary>
/// Tests whether <paramref name="value"/> ends with <paramref name="possibleEnd"/>, ignoring case.
/// </summary>
/// <param name="value">The string to inspect.</param>
/// <param name="possibleEnd">The candidate suffix.</param>
/// <returns>
/// true when both arguments are the same reference (including both null) or when the tail of
/// <paramref name="value"/> matches <paramref name="possibleEnd"/> after lower-casing;
/// false when exactly one argument is null, when <paramref name="value"/> is shorter than
/// <paramref name="possibleEnd"/>, or when any character pair differs.
/// </returns>
public static bool EndsWith(string value, string possibleEnd)
{
    if (ReferenceEquals(value, possibleEnd))
    {
        return true;
    }

    if (value is null || possibleEnd is null)
    {
        return false;
    }

    // Index in value at which the candidate suffix would have to begin.
    int offset = value.Length - possibleEnd.Length;
    if (offset < 0)
    {
        return false;
    }

    // Walk backwards from the last character of both strings.
    for (int k = possibleEnd.Length - 1; k >= 0; k--)
    {
        if (!AreEqualLowerUnicode(value[offset + k], possibleEnd[k]))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Tests whether <paramref name="value"/> starts with <paramref name="possibleStart"/>, ignoring case.
/// </summary>
/// <param name="value">The string to inspect.</param>
/// <param name="possibleStart">The candidate prefix.</param>
/// <returns>
/// true when both arguments are the same reference (including both null) or when the head of
/// <paramref name="value"/> matches <paramref name="possibleStart"/> after lower-casing;
/// false when exactly one argument is null, when <paramref name="value"/> is shorter than
/// <paramref name="possibleStart"/>, or when any character pair differs.
/// </returns>
public static bool StartsWith(string value, string possibleStart)
{
    if (ReferenceEquals(value, possibleStart))
    {
        return true;
    }

    if (value is null || possibleStart is null)
    {
        return false;
    }

    int prefixLength = possibleStart.Length;
    if (prefixLength > value.Length)
    {
        return false;
    }

    int index = 0;
    while (index < prefixLength)
    {
        if (!AreEqualLowerUnicode(value[index], possibleStart[index]))
        {
            return false;
        }

        index++;
    }

    return true;
}
/// <summary>
/// Computes a case-insensitive hash code by feeding the lower-cased characters of
/// <paramref name="str"/> into an FNV hash.
/// </summary>
/// <param name="str">The string to hash; must not be null.</param>
/// <returns>A hash code that is identical for strings differing only by case.</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
public override int GetHashCode(string str)
{
    // Fail with the exception type documented for StringComparer.GetHashCode instead of
    // letting a NullReferenceException escape from the loop below.
    if (str is null)
    {
        throw new ArgumentNullException(nameof(str));
    }

    int hashCode = Hash.FnvOffsetBias;
    foreach (char c in str)
    {
        // Hash the lower-cased character so the result agrees with Equals.
        hashCode = Hash.CombineFNVHash(hashCode, ToLower(c));
    }

    return hashCode;
}
}
/// <summary>
/// The single shared <see cref="OneToOneUnicodeComparer"/> instance, which compares strings
/// according to Unicode rules for case-insensitive identifier comparison (lower-case mapping).
/// </summary>
/// <remarks>
/// These are also the rules used for VB identifier comparison.
/// </remarks>
private static readonly OneToOneUnicodeComparer s_comparer = new OneToOneUnicodeComparer();
/// <summary>
/// Gets a <see cref="StringComparer"/> that compares strings according to Unicode rules for
/// case-insensitive identifier comparison (lower-case mapping).
/// </summary>
/// <remarks>
/// These are also the rules used for VB identifier comparison.
/// </remarks>
public static StringComparer Comparer
{
    get { return s_comparer; }
}
/// <summary>
/// Tests whether two strings are equal under the Unicode rules for case-insensitive
/// identifier comparison (lower-case mapping).
/// </summary>
/// <param name="left">First identifier to compare.</param>
/// <param name="right">Second identifier to compare.</param>
/// <returns>true if the identifiers should be considered the same.</returns>
/// <remarks>
/// These are also the rules used for VB identifier comparison.
/// </remarks>
public static bool Equals(string left, string right)
{
    return s_comparer.Equals(left, right);
}
/// <summary>
/// Tests whether two character spans are equal under the Unicode rules for case-insensitive
/// identifier comparison (lower-case mapping).
/// </summary>
/// <param name="left">First identifier to compare.</param>
/// <param name="right">Second identifier to compare.</param>
/// <returns>true if the identifiers should be considered the same.</returns>
/// <remarks>
/// These are also the rules used for VB identifier comparison.
/// </remarks>
public static bool Equals(ReadOnlySpan<char> left, ReadOnlySpan<char> right)
{
    return s_comparer.Equals(left, right);
}
/// <summary>
/// Determines whether the string <paramref name="value"/> ends with the string
/// <paramref name="possibleEnd"/>, ignoring case.
/// </summary>
/// <param name="value">The string to inspect.</param>
/// <param name="possibleEnd">The candidate suffix.</param>
/// <returns>The result of <c>OneToOneUnicodeComparer.EndsWith</c> for the same arguments.</returns>
public static bool EndsWith(string value, string possibleEnd)
{
    return OneToOneUnicodeComparer.EndsWith(value, possibleEnd);
}
/// <summary>
/// Determines whether the string <paramref name="value"/> starts with the string
/// <paramref name="possibleStart"/>, ignoring case.
/// </summary>
/// <param name="value">The string to inspect.</param>
/// <param name="possibleStart">The candidate prefix.</param>
/// <returns>The result of <c>OneToOneUnicodeComparer.StartsWith</c> for the same arguments.</returns>
public static bool StartsWith(string value, string possibleStart)
{
    return OneToOneUnicodeComparer.StartsWith(value, possibleStart);
}
/// <summary>
/// Compares two strings according to the Unicode rules for case-insensitive
/// identifier comparison (lower-case mapping).
/// </summary>
/// <param name="left">First identifier to compare</param>
/// <param name="right">Second identifier to compare</param>
/// <returns>
/// A negative value if <paramref name="left"/> sorts before <paramref name="right"/>,
/// a positive value if <paramref name="left"/> sorts after <paramref name="right"/>,
/// and 0 if they are equal. Only the sign is meaningful; the result is not limited to -1 and 1.
/// </returns>
/// <remarks>
/// These are also the rules used for VB identifier comparison.
/// </remarks>
public static int Compare(string left, string right) => s_comparer.Compare(left, right);
/// <summary>
/// Compares two character spans according to the Unicode rules for case-insensitive
/// identifier comparison (lower-case mapping).
/// </summary>
/// <param name="left">First identifier to compare</param>
/// <param name="right">Second identifier to compare</param>
/// <returns>
/// A negative value if <paramref name="left"/> sorts before <paramref name="right"/>,
/// a positive value if <paramref name="left"/> sorts after <paramref name="right"/>,
/// and 0 if they are equal. Only the sign is meaningful; the result is not limited to -1 and 1.
/// </returns>
/// <remarks>
/// These are also the rules used for VB identifier comparison.
/// </remarks>
public static int Compare(ReadOnlySpan<char> left, ReadOnlySpan<char> right) => s_comparer.Compare(left, right);
/// <summary>
/// Computes a case-insensitive hash code for a Unicode identifier.
/// </summary>
/// <param name="value">Identifier to hash; asserted to be non-null.</param>
/// <returns>The hash code for the given identifier.</returns>
/// <remarks>
/// These are also the rules used for VB identifier comparison.
/// </remarks>
public static int GetHashCode(string value)
{
    RoslynDebug.Assert(value != null);

    int hash = s_comparer.GetHashCode(value);
    return hash;
}
/// <summary>
/// Converts a string to lower case per Unicode.
/// </summary>
/// <param name="value">The string to convert; may be null.</param>
/// <returns>
/// null when <paramref name="value"/> is null, <paramref name="value"/> itself when it is empty,
/// and otherwise a new string with every character lower-cased.
/// </returns>
[return: NotNullIfNotNull(parameterName: nameof(value))]
public static string? ToLower(string? value)
{
    // Null and empty inputs are handed back untouched.
    if (string.IsNullOrEmpty(value))
    {
        return value;
    }

    PooledStringBuilder pooled = PooledStringBuilder.GetInstance();
    StringBuilder buffer = pooled.Builder;

    // Copy the text into the builder and lower-case it in place.
    buffer.Append(value);
    ToLower(buffer);

    return pooled.ToStringAndFree();
}
/// <summary>
/// Lower-cases the contents of a <see cref="StringBuilder"/> in place, per Unicode rules.
/// </summary>
/// <param name="builder">The builder to modify; a null builder is ignored.</param>
public static void ToLower(StringBuilder builder)
{
    if (builder != null)
    {
        for (int position = 0; position < builder.Length; position++)
        {
            char current = builder[position];
            builder[position] = ToLower(current);
        }
    }
}
}
}