Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.
/ corefx Public archive

Commit

Permalink
Add Encode(Span<char>) API (#39900)
Browse files Browse the repository at this point in the history
For performance, adds a new span-based API for encoding chars. Also changes the semantics of System.Text.Json to use the replacement character for bad UTF-16 surrogate pairs instead of throwing; this makes it consistent with UTF-8 and with the default behavior of System.Text.Encoding.Json.
  • Loading branch information
steveharter authored Aug 6, 2019
1 parent 69df8ab commit 0cb8c78
Show file tree
Hide file tree
Showing 9 changed files with 540 additions and 525 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ protected TextEncoder() { }
public virtual void Encode(System.IO.TextWriter output, char[] value, int startIndex, int characterCount) { }
public void Encode(System.IO.TextWriter output, string value) { }
public virtual void Encode(System.IO.TextWriter output, string value, int startIndex, int characterCount) { }
public virtual System.Buffers.OperationStatus Encode(System.ReadOnlySpan<char> source, System.Span<char> destination, out int charsConsumed, out int charsWritten, bool isFinalBlock = true) { throw null; }
public virtual string Encode(string value) { throw null; }
public virtual System.Buffers.OperationStatus EncodeUtf8(System.ReadOnlySpan<byte> utf8Source, System.Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) { throw null; }
[System.CLSCompliantAttribute(false)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ public unsafe override int FindFirstCharacterToEncode(char* text, int textLength
{
throw new ArgumentNullException(nameof(text));
}

return _allowedCharacters.FindFirstCharacterToEncode(text, textLength);
}

Expand Down Expand Up @@ -160,7 +161,7 @@ public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buff

if (!WillEncode(unicodeScalar)) { return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); }

char[] toCopy = null;
char[] toCopy;
switch (unicodeScalar)
{
case '\b': toCopy = s_b; break;
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -47,71 +47,6 @@ private static uint[] CreateDefinedCharacterBitmapMachineEndian()
return bigEndianData;
}

/// <summary>
/// A copy of the logic in Rune.DecodeFromUtf16: decodes the scalar value found at the
/// start of <paramref name="source"/>.
/// </summary>
/// <param name="source">The UTF-16 code units to decode from.</param>
/// <param name="result">
/// The decoded scalar value when <see cref="OperationStatus.Done"/> is returned;
/// U+FFFD for every other status.
/// </param>
/// <param name="charsConsumed">
/// The number of chars of <paramref name="source"/> covered by the returned status.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> for a BMP scalar or a well-formed surrogate pair,
/// <see cref="OperationStatus.NeedMoreData"/> for an empty buffer or a buffer holding only a high surrogate,
/// and <see cref="OperationStatus.InvalidData"/> for any other surrogate arrangement.
/// </returns>
public static OperationStatus DecodeScalarValueFromUtf16(ReadOnlySpan<char> source, out uint result, out int charsConsumed)
{
    const char ReplacementChar = '\uFFFD';

    if (source.IsEmpty)
    {
        // Nothing to inspect yet.
        result = ReplacementChar;
        charsConsumed = 0;
        return OperationStatus.NeedMoreData;
    }

    uint lead = source[0];

    // Common case: a BMP scalar value stands on its own.
    if (!UnicodeUtility.IsSurrogateCodePoint(lead))
    {
        result = lead;
        charsConsumed = 1;
        return OperationStatus.Done;
    }

    if (UnicodeUtility.IsHighSurrogateCodePoint(lead))
    {
        if (source.Length == 1)
        {
            // A lone high surrogate at the end of the buffer could still be completed
            // by later input, so this is not reported as an error.
            result = ReplacementChar;
            charsConsumed = 1;
            return OperationStatus.NeedMoreData;
        }

        uint trail = source[1];
        if (UnicodeUtility.IsLowSurrogateCodePoint(trail))
        {
            // Well-formed pair: combine into a supplementary scalar value.
            result = UnicodeUtility.GetScalarFromUtf16SurrogatePair(lead, trail);
            charsConsumed = 2;
            return OperationStatus.Done;
        }
    }

    // Either a low surrogate came first, or a high surrogate was not followed by a low one.
    // The maximal invalid subsequence for UTF-16 is always a single code unit in length.
    result = ReplacementChar;
    charsConsumed = 1;
    return OperationStatus.InvalidData;
}

/// <summary>
/// A copy of the logic in Rune.DecodeFromUtf8.
/// </summary>
Expand Down Expand Up @@ -309,23 +244,26 @@ internal static ReadOnlySpan<uint> GetDefinedCharacterBitmap()
/// Set 'endOfString' to true if 'pChar' points to the last character in the stream.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair)
internal static int GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair, out bool needsMoreData)
{
if (!char.IsSurrogate(first))
{
wasSurrogatePair = false;
needsMoreData = false;
return first;
}
return GetScalarValueFromUtf16Slow(first, second, out wasSurrogatePair);

return GetScalarValueFromUtf16Slow(first, second, out wasSurrogatePair, out needsMoreData);
}

private static int GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair)
private static int GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair, out bool needMoreData)
{
#if DEBUG
if (!char.IsSurrogate(first))
{
Debug.Assert(false, "This case should've been handled by the fast path.");
wasSurrogatePair = false;
needMoreData = false;
return first;
}
#endif
Expand All @@ -337,19 +275,22 @@ private static int GetScalarValueFromUtf16Slow(char first, char? second, out boo
{
// valid surrogate pair - extract codepoint
wasSurrogatePair = true;
needMoreData = false;
return GetScalarValueFromUtf16SurrogatePair(first, second.Value);
}
else
{
// unmatched surrogate - substitute
wasSurrogatePair = false;
needMoreData = false;
return UNICODE_REPLACEMENT_CHAR;
}
}
else
{
// unmatched surrogate - substitute
wasSurrogatePair = false;
needMoreData = true; // Last character was high surrogate; we need more data.
return UNICODE_REPLACEMENT_CHAR;
}
}
Expand All @@ -358,6 +299,7 @@ private static int GetScalarValueFromUtf16Slow(char first, char? second, out boo
// unmatched surrogate - substitute
Debug.Assert(char.IsLowSurrogate(first));
wasSurrogatePair = false;
needMoreData = false;
return UNICODE_REPLACEMENT_CHAR;
}
}
Expand Down
204 changes: 200 additions & 4 deletions src/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Buffers;
using System.Globalization;
using System.IO;
using System.Linq;
Expand All @@ -17,12 +18,121 @@ public partial class JavaScriptStringEncoderTests
[Fact]
public void TestSurrogate()
{
    // U+1F4A9 lies outside the BMP, so in UTF-16 it is a surrogate pair; every overload
    // below is expected to escape it as two \uXXXX sequences.
    const string Source = "\U0001f4a9";
    const string Escaped = "\\uD83D\\uDCA9";

    System.Text.Encodings.Web.JavaScriptEncoder encoder = System.Text.Encodings.Web.JavaScriptEncoder.Default;

    // String overload.
    Assert.Equal(Escaped, encoder.Encode(Source));

    // TextWriter overload.
    using (var output = new StringWriter())
    {
        encoder.Encode(output, Source);
        Assert.Equal(Escaped, output.GetStringBuilder().ToString());
    }

    // Span overload, with a destination that is exactly large enough.
    Span<char> buffer = new char[12];
    OperationStatus result = encoder.Encode(
        Source.AsSpan(), buffer, out int consumed, out int written, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(2, consumed);
    Assert.Equal(12, written);

    char[] produced = buffer.Slice(0, written).ToArray();
    Assert.Equal(Escaped, new string(produced));
}

[Fact]
public void TestSurrogateBufferDoesNotUnderOrOverWrite()
{
    const int WindowStart = 100;
    const int WindowLength = 12;

    // Sentinel chars sit immediately before and immediately after the window that is
    // handed to the encoder; both must still be intact afterwards.
    Span<char> backing = new char[212];
    backing[WindowStart - 1] = 'x';
    backing[WindowStart + WindowLength] = 'x';

    Span<char> window = backing.Slice(WindowStart, WindowLength);
    OperationStatus result = System.Text.Encodings.Web.JavaScriptEncoder.Default.Encode(
        "\U0001f4a9".AsSpan(), window, out int consumed, out int written, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(2, consumed);
    Assert.Equal(12, written);
    Assert.Equal('x', backing[WindowStart - 1]);
    Assert.Equal('x', backing[WindowStart + WindowLength]);
}

[Fact]
public void TestSurrogateBufferOverlaps()
{
    // Place the input at the front of the very buffer that will also receive the output.
    Span<char> shared = new char[100];
    ReadOnlySpan<char> text = "\U0001f4a9".AsSpan();
    text.CopyTo(shared);

    // Overlap behavior is undefined and documented as not valid; this test only
    // records that no issue is expected for this particular input.
    ReadOnlySpan<char> overlappingSource = shared.Slice(0, 2);
    OperationStatus result = System.Text.Encodings.Web.JavaScriptEncoder.Default.Encode(
        overlappingSource, shared, out int consumed, out int written, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(2, consumed);
    Assert.Equal(12, written);
}

[Fact]
public void TestSurrogateBufferTooSmall()
{
    // The escaped form of the input needs 12 chars; offer one fewer.
    Span<char> tooShort = new char[11];

    OperationStatus result = System.Text.Encodings.Web.JavaScriptEncoder.Default.Encode(
        "\U0001f4a9".AsSpan(), tooShort, out int consumed, out int written, isFinalBlock: true);

    // Nothing may be reported as consumed or written when the output does not fit.
    Assert.Equal(OperationStatus.DestinationTooSmall, result);
    Assert.Equal(0, consumed);
    Assert.Equal(0, written);
}

[Fact]
public void JavaScriptStringEncoder_NonEmptySource_EmptyDest_Throws()
{
    // NOTE: despite the "_Throws" suffix, this test asserts that a DestinationTooSmall
    // status is returned when a null (empty) destination is passed; no exception is expected.
    Span<char> emptyDestination = null;

    OperationStatus result = System.Text.Encodings.Web.JavaScriptEncoder.Default.Encode(
        "\U0001f4a9".AsSpan(), emptyDestination, out int _, out int _, isFinalBlock: true);

    Assert.Equal(OperationStatus.DestinationTooSmall, result);
}

[Fact]
public void JavaScriptStringEncoder_EmptySource_EmptyDest()
{
    // With nothing to encode, a null (empty) destination is expected to be acceptable.
    Span<char> emptyDestination = null;

    OperationStatus result = System.Text.Encodings.Web.JavaScriptEncoder.Default.Encode(
        "".AsSpan(), emptyDestination, out int _, out int _, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
}

[Fact]
public void TestEmptySourceEncode()
{
    System.Text.Encodings.Web.JavaScriptEncoder encoder = System.Text.Encodings.Web.JavaScriptEncoder.Default;

    // String overload.
    Assert.Equal("", encoder.Encode(""));

    // TextWriter overload.
    using (var output = new StringWriter())
    {
        encoder.Encode(output, "");
        Assert.Equal("", output.GetStringBuilder().ToString());
    }

    // Span overload with a real destination buffer.
    Span<char> buffer = new char[12];
    OperationStatus result = encoder.Encode(
        "".AsSpan(), buffer, out int consumed, out int written, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(0, consumed);
    Assert.Equal(0, written);
    Assert.Equal("", new string(buffer.Slice(0, written).ToArray()));

    // Span overload with a null destination: this must not throw when there are
    // no characters to encode.
    Span<char> none = null;
    result = encoder.Encode(
        "".AsSpan(), none, out int consumedWithNull, out int writtenWithNull, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(0, consumedWithNull);
    Assert.Equal(0, writtenWithNull);
    Assert.Equal("", new string(none.Slice(0, writtenWithNull).ToArray()));
}

[Fact]
Expand Down Expand Up @@ -229,14 +339,100 @@ public void JavaScriptStringEncode_BadSurrogates_ReturnsUnicodeReplacementChar()
JavaScriptStringEncoder encoder = new JavaScriptStringEncoder(UnicodeRanges.All); // allow all codepoints

// "a<unpaired leading>b<unpaired trailing>c<trailing before leading>d<unpaired trailing><valid>e<high at end of string>"
const string input = "a\uD800b\uDFFFc\uDFFF\uD800d\uDFFF\uD800\uDFFFe\uD800";
const string expected = "a\uFFFDb\uFFFDc\uFFFD\uFFFDd\uFFFD\\uD800\\uDFFFe\uFFFD"; // 'D800' 'DFFF' was preserved since it's valid
const string Input = "a\uD800b\uDFFFc\uDFFF\uD800d\uDFFF\uD800\uDFFFe\uD800";
const string Expected = "a\uFFFDb\uFFFDc\uFFFD\uFFFDd\uFFFD\\uD800\\uDFFFe\uFFFD"; // 'D800' 'DFFF' was preserved since it's valid

// Act
string retVal = encoder.JavaScriptStringEncode(input);
string retVal = encoder.JavaScriptStringEncode(Input);

// Assert
Assert.Equal(expected, retVal);
Assert.Equal(Expected, retVal);
}

[Fact]
public void JavaScriptEncoder_BadSurrogates_ReturnsUnicodeReplacementChar()
{
    // Input layout:
    // "a<unpaired leading>b<unpaired trailing>c<trailing before leading>d<unpaired trailing><valid>e<high at end of string>"
    const string Input = "a\uD800b\uDFFFc\uDFFF\uD800d\uDFFF\uD800\uDFFFe\uD800";

    // Every bad surrogate becomes U+FFFD; the valid 'D800' 'DFFF' pair is preserved (in escaped form).
    const string Expected = "a\uFFFDb\uFFFDc\uFFFD\uFFFDd\uFFFD\\uD800\\uDFFFe\uFFFD";

    // Allow all code points so that only the surrogate handling is exercised.
    JavaScriptEncoder encoder = JavaScriptEncoder.Create(UnicodeRanges.All);

    // String-based Encode().
    string encodedString = encoder.Encode(Input);
    Assert.Equal(Expected, encodedString);

    // OperationStatus-based Encode().
    Span<char> buffer = new char[23];
    OperationStatus result = encoder.Encode(
        Input.AsSpan(), buffer, out int consumed, out int written, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(13, consumed);
    Assert.Equal(13, Input.Length);
    Assert.Equal(23, written);

    char[] produced = buffer.Slice(0, written).ToArray();
    Assert.Equal(Expected, new string(produced));
}

[Fact]
public void JavaScriptEncoder_UnpairedSurrogatesReplaced()
{
    // 'a' followed by one low surrogate and two high surrogates, none of which form a pair.
    const string Input = "a\uDFFF\uD800\uD800";
    const string Expected = "a\uFFFD\uFFFD\uFFFD";

    Assert.Equal(4, Input.Length);

    // Allow all code points so that only the surrogate handling is exercised.
    JavaScriptEncoder encoder = JavaScriptEncoder.Create(UnicodeRanges.All);

    // String-based Encode().
    string encodedString = encoder.Encode(Input);
    Assert.Equal(Expected, encodedString);

    // OperationStatus-based Encode(); as the final block, the trailing high surrogate
    // is expected to be replaced rather than reported as needing more data.
    Span<char> buffer = new char[100];
    OperationStatus result = encoder.Encode(
        Input.AsSpan(), buffer, out int consumed, out int written, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(4, consumed);
    Assert.Equal(4, written);

    char[] produced = buffer.Slice(0, written).ToArray();
    Assert.Equal(Expected, new string(produced));
}

[Fact]
public void JavaScriptEncoder_NeedsMoreData()
{
    // 'a' followed by a well-formed surrogate pair.
    const string Input = "a\uD800\uDFFF";
    const string Expected = "a\\uD800\\uDFFF";

    Assert.Equal(3, Input.Length);

    // Allow all code points so that only the surrogate handling is exercised.
    JavaScriptEncoder encoder = JavaScriptEncoder.Create(UnicodeRanges.All);
    Span<char> buffer = new char[100];

    // Step 1: hand over only the first two chars, which leaves uD800 as an unpaired high
    // surrogate at the end. With isFinalBlock=false the encoder should ask for more data
    // after emitting just the 'a'.
    OperationStatus result = encoder.Encode(
        Input.AsSpan(0, 2), buffer, out int consumedFirst, out int writtenFirst, isFinalBlock: false);

    Assert.Equal(OperationStatus.NeedMoreData, result);
    Assert.Equal(1, consumedFirst);
    Assert.Equal(1, writtenFirst);
    Assert.Equal("a", new string(buffer.Slice(0, writtenFirst).ToArray()));

    // Step 2: resume from where step 1 stopped, now with the full pair available;
    // isFinalBlock stays false.
    ReadOnlySpan<char> remainder = Input.AsSpan(consumedFirst, 2);
    Span<char> tail = buffer.Slice(writtenFirst);

    result = encoder.Encode(remainder, tail, out int consumedSecond, out int writtenSecond, isFinalBlock: false);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(2, consumedSecond);
    Assert.Equal(12, writtenSecond);
    Assert.Equal(Expected, new string(buffer.Slice(0, writtenFirst + writtenSecond).ToArray()));

    // Step 3: repeat step 2 with isFinalBlock=true. The outcome must be identical because
    // there is no longer a trailing unpaired high surrogate.
    result = encoder.Encode(remainder, tail, out int consumedFinal, out int writtenFinal, isFinalBlock: true);

    Assert.Equal(OperationStatus.Done, result);
    Assert.Equal(2, consumedFinal);
    Assert.Equal(12, writtenFinal);
    Assert.Equal(Expected, new string(buffer.Slice(0, writtenFirst + writtenFinal).ToArray()));
}

[Fact]
Expand Down
Loading

0 comments on commit 0cb8c78

Please sign in to comment.