Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.
/ corefx Public archive

Commit

Permalink
Add Utf16 Encode() to S.T.Encoding.Web and uptake in S.T.Json
Browse files Browse the repository at this point in the history
  • Loading branch information
steveharter committed Jul 31, 2019
1 parent 739f443 commit 9fbd6ee
Show file tree
Hide file tree
Showing 7 changed files with 328 additions and 419 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ protected TextEncoder() { }
public virtual void Encode(System.IO.TextWriter output, char[] value, int startIndex, int characterCount) { }
public void Encode(System.IO.TextWriter output, string value) { }
public virtual void Encode(System.IO.TextWriter output, string value, int startIndex, int characterCount) { }
public virtual System.Buffers.OperationStatus Encode(System.ReadOnlySpan<char> source, System.Span<char> destination, out int charsConsumed, out int charsWritten, bool isFinalBlock = true) { throw null; }
public virtual string Encode(string value) { throw null; }
public virtual System.Buffers.OperationStatus EncodeUtf8(System.ReadOnlySpan<byte> utf8Source, System.Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true) { throw null; }
[System.CLSCompliantAttribute(false)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ public unsafe override int FindFirstCharacterToEncode(char* text, int textLength
{
throw new ArgumentNullException(nameof(text));
}

return _allowedCharacters.FindFirstCharacterToEncode(text, textLength);
}

Expand Down Expand Up @@ -160,7 +161,7 @@ public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buff

if (!WillEncode(unicodeScalar)) { return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten); }

char[] toCopy = null;
char[] toCopy;
switch (unicodeScalar)
{
case '\b': toCopy = s_b; break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ namespace System.Text.Encodings.Web
public abstract class TextEncoder
{
// Fast cache for Ascii
#pragma warning disable CA1825 // should not be Array.Empty<byte> as this is used as a singleton for comparisons
private static readonly byte[] s_noEscape = new byte[] { };
#pragma warning restore CA1825
private byte[][] _asciiEscape = new byte[0x80][];


// Keep a reference to Array.Empty<byte> as this is used as a singleton for comparisons
// and there is no guarantee that Array.Empty<byte>() will always be the same instance.
private static readonly byte[] s_noEscape = Array.Empty<byte>();

// The following pragma disables a warning complaining about non-CLS compliant members being abstract,
// and wants me to mark the type as non-CLS compliant.
// It is true that this type cannot be extended by all CLS compliant languages.
Expand Down Expand Up @@ -111,15 +112,25 @@ public virtual string Encode(string value)
if (bufferSize < 1024)
{
char* wholebuffer = stackalloc char[bufferSize];
int totalWritten = EncodeIntoBuffer(wholebuffer, bufferSize, valuePointer, value.Length, firstCharacterToEncode);
OperationStatus status = EncodeIntoBuffer(wholebuffer, bufferSize, valuePointer, value.Length, out int _, out int totalWritten, firstCharacterToEncode);
if (status != OperationStatus.Done)
{
ThrowArgumentException_MaxOutputCharsPerInputChar();
}

result = new string(wholebuffer, 0, totalWritten);
}
else
{
char[] wholebuffer = new char[bufferSize];
fixed (char* buffer = &wholebuffer[0])
{
int totalWritten = EncodeIntoBuffer(buffer, bufferSize, valuePointer, value.Length, firstCharacterToEncode);
OperationStatus status = EncodeIntoBuffer(buffer, bufferSize, valuePointer, value.Length, out int _, out int totalWritten, firstCharacterToEncode);
if (status != OperationStatus.Done)
{
ThrowArgumentException_MaxOutputCharsPerInputChar();
}

result = new string(wholebuffer, 0, totalWritten);
}
}
Expand All @@ -129,12 +140,18 @@ public virtual string Encode(string value)
}
}

// NOTE: The order of the parameters to this method is a work around for https://github.com/dotnet/corefx/issues/4455
// and the underlying Mono bug: https://bugzilla.xamarin.com/show_bug.cgi?id=36052.
// If changing the signature of this method, ensure this issue isn't regressing on Mono.
private unsafe int EncodeIntoBuffer(char* buffer, int bufferLength, char* value, int valueLength, int firstCharacterToEncode)
private unsafe OperationStatus EncodeIntoBuffer(
char* buffer,
int bufferLength,
char* value,
int valueLength,
out int charsConsumed,
out int charsWritten,
int firstCharacterToEncode,
bool isFinalBlock = true)
{
int totalWritten = 0;
char* originalBuffer = buffer;
charsWritten = 0;

if (firstCharacterToEncode > 0)
{
Expand All @@ -144,7 +161,7 @@ private unsafe int EncodeIntoBuffer(char* buffer, int bufferLength, char* value,
destinationSizeInBytes: sizeof(char) * bufferLength,
sourceBytesToCopy: sizeof(char) * firstCharacterToEncode);

totalWritten += firstCharacterToEncode;
charsWritten += firstCharacterToEncode;
bufferLength -= firstCharacterToEncode;
buffer += firstCharacterToEncode;
}
Expand All @@ -154,7 +171,6 @@ private unsafe int EncodeIntoBuffer(char* buffer, int bufferLength, char* value,
char firstChar = value[valueIndex];
char secondChar = firstChar;
bool wasSurrogatePair = false;
int charsWritten;

// this loop processes character pairs (in case they are surrogates).
// there is an if block below to process single last character.
Expand All @@ -169,6 +185,7 @@ private unsafe int EncodeIntoBuffer(char* buffer, int bufferLength, char* value,
{
firstChar = value[secondCharIndex - 1];
}

secondChar = value[secondCharIndex];

if (!WillEncode(firstChar))
Expand All @@ -177,41 +194,51 @@ private unsafe int EncodeIntoBuffer(char* buffer, int bufferLength, char* value,
*buffer = firstChar;
buffer++;
bufferLength--;
totalWritten++;
charsWritten++;
}
else
{
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair);
if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out charsWritten))
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair, out bool _);
if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out int charsWrittenThisTime))
{
throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
charsConsumed = (int)(originalBuffer - buffer);
return OperationStatus.DestinationTooSmall;
}

buffer += charsWritten;
bufferLength -= charsWritten;
totalWritten += charsWritten;
if (wasSurrogatePair)
{
secondCharIndex++;
}

buffer += charsWrittenThisTime;
bufferLength -= charsWrittenThisTime;
charsWritten += charsWrittenThisTime;
}
}

if (secondCharIndex == valueLength)
{
firstChar = value[valueLength - 1];
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair);
if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out charsWritten))
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair, out bool wasUnmatchedSurrogate);
if (!isFinalBlock && wasUnmatchedSurrogate)
{
throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
charsConsumed = (int)(buffer - originalBuffer);
return OperationStatus.NeedMoreData;
}

buffer += charsWritten;
bufferLength -= charsWritten;
totalWritten += charsWritten;
if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out int charsWrittenThisTime))
{
charsConsumed = (int)(buffer - originalBuffer);
return OperationStatus.DestinationTooSmall;
}

buffer += charsWrittenThisTime;
bufferLength -= charsWrittenThisTime;
charsWritten += charsWrittenThisTime;
}

return totalWritten;
charsConsumed = valueLength;
return OperationStatus.Done;
}

/// <summary>
Expand Down Expand Up @@ -250,7 +277,7 @@ public virtual void Encode(TextWriter output, string value, int startIndex, int
char* substring = valuePointer + startIndex;
int firstIndexToEncode = FindFirstCharacterToEncode(substring, characterCount);

if (firstIndexToEncode == -1) // nothing to encode;
if (firstIndexToEncode == -1) // nothing to encode;
{
if (startIndex == 0 && characterCount == value.Length) // write whole string
{
Expand Down Expand Up @@ -303,7 +330,7 @@ public virtual void Encode(TextWriter output, char[] value, int startIndex, int
char* substring = valuePointer + startIndex;
int firstIndexToEncode = FindFirstCharacterToEncode(substring, characterCount);

if (firstIndexToEncode == -1) // nothing to encode;
if (firstIndexToEncode == -1) // nothing to encode;
{
if (startIndex == 0 && characterCount == value.Length) // write whole string
{
Expand Down Expand Up @@ -342,7 +369,12 @@ public virtual void Encode(TextWriter output, char[] value, int startIndex, int
/// <see langword="false"/> if there is no further source data that needs to be encoded.</param>
/// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation.</returns>
/// <remarks>The buffers <paramref name="utf8Source"/> and <paramref name="utf8Destination"/> must not overlap.</remarks>
public unsafe virtual OperationStatus EncodeUtf8(ReadOnlySpan<byte> utf8Source, Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
public unsafe virtual OperationStatus EncodeUtf8(
ReadOnlySpan<byte> utf8Source,
Span<byte> utf8Destination,
out int bytesConsumed,
out int bytesWritten,
bool isFinalBlock = true)
{
int originalUtf8SourceLength = utf8Source.Length;
int originalUtf8DestinationLength = utf8Destination.Length;
Expand Down Expand Up @@ -464,6 +496,7 @@ public unsafe virtual OperationStatus EncodeUtf8(ReadOnlySpan<byte> utf8Source,
bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
return OperationStatus.NeedMoreData;
}
// else treat this as a normal invalid subsequence.
}
else if (opStatus == OperationStatus.DestinationTooSmall)
{
Expand Down Expand Up @@ -524,6 +557,34 @@ internal static OperationStatus EncodeUtf8Shim(TextEncoder encoder, ReadOnlySpan
return encoder.EncodeUtf8(utf8Source, utf8Destination, out bytesConsumed, out bytesWritten, isFinalBlock);
}

/// <summary>
/// Encodes the supplied UTF-16 characters into a caller-provided destination buffer.
/// </summary>
/// <param name="source">A source buffer containing the characters to encode.</param>
/// <param name="destination">The destination buffer to which the encoded form of <paramref name="source"/>
/// will be written.</param>
/// <param name="charsConsumed">The number of characters consumed from the <paramref name="source"/> buffer.</param>
/// <param name="charsWritten">The number of characters written to the <paramref name="destination"/> buffer.</param>
/// <param name="isFinalBlock"><see langword="true"/> if <paramref name="source"/> is the final block of input,
/// that is, there is no further source data that needs to be encoded;
/// <see langword="false"/> if more source data will follow. When <see langword="false"/>, an unmatched
/// surrogate at the end of <paramref name="source"/> is reported as
/// <see cref="OperationStatus.NeedMoreData"/> instead of being encoded.</param>
/// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation.</returns>
/// <remarks>The buffers <paramref name="source"/> and <paramref name="destination"/> must not overlap.</remarks>
public virtual OperationStatus Encode(
    ReadOnlySpan<char> source,
    Span<char> destination,
    out int charsConsumed,
    out int charsWritten,
    bool isFinalBlock = true)
{
    unsafe
    {
        // Pin both spans so the pointer-based core routine can operate on them directly.
        fixed (char* sourcePtr = source)
        {
            fixed (char* destinationPtr = destination)
            {
                // No prefix of the input is known to be safe, so the scan starts at index 0.
                return EncodeIntoBuffer(
                    destinationPtr,
                    destination.Length,
                    sourcePtr,
                    source.Length,
                    out charsConsumed,
                    out charsWritten,
                    firstCharacterToEncode: 0,
                    isFinalBlock: isFinalBlock);
            }
        }
    }
}

private unsafe void EncodeCore(TextWriter output, char* value, int valueLength)
{
Debug.Assert(value != null & output != null);
Expand Down Expand Up @@ -559,10 +620,10 @@ private unsafe void EncodeCore(TextWriter output, char* value, int valueLength)
}
else
{
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair);
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair, out bool _);
if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out charsWritten))
{
throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
ThrowArgumentException_MaxOutputCharsPerInputChar();
}
Write(output, buffer, charsWritten);

Expand All @@ -576,10 +637,10 @@ private unsafe void EncodeCore(TextWriter output, char* value, int valueLength)
if (!wasSurrogatePair || (secondCharIndex == valueLength))
{
firstChar = value[valueLength - 1];
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair);
int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair, out bool _);
if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out charsWritten))
{
throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
ThrowArgumentException_MaxOutputCharsPerInputChar();
}
Write(output, buffer, charsWritten);
}
Expand Down Expand Up @@ -719,12 +780,17 @@ private byte[] GetAsciiEncoding(byte value)
{
if (!WillEncode(value))
{
_asciiEscape[value] = s_noEscape;
return s_noEscape;
encoding = s_noEscape;
_asciiEscape[value] = encoding;
}
}

return encoding;
}

/// <summary>
/// Throw helper shared by the encoding routines. Raises the <see cref="ArgumentException"/>
/// used when encoding into a buffer does not complete or <c>TryEncodeUnicodeScalar</c> reports failure.
/// </summary>
/// <exception cref="ArgumentException">Always thrown.</exception>
private static void ThrowArgumentException_MaxOutputCharsPerInputChar()
    => throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -309,23 +309,25 @@ internal static ReadOnlySpan<uint> GetDefinedCharacterBitmap()
/// Set 'endOfString' to true if 'pChar' points to the last character in the stream.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair)
internal static int GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair, out bool wasUnmatchedSurrogate)
{
if (!char.IsSurrogate(first))
{
wasSurrogatePair = false;
wasUnmatchedSurrogate = false;
return first;
}
return GetScalarValueFromUtf16Slow(first, second, out wasSurrogatePair);
return GetScalarValueFromUtf16Slow(first, second, out wasSurrogatePair, out wasUnmatchedSurrogate);
}

private static int GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair)
private static int GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair, out bool wasUnmatchedSurrogate)
{
#if DEBUG
if (!Char.IsSurrogate(first))
if (!char.IsSurrogate(first))
{
Debug.Assert(false, "This case should've been handled by the fast path.");
wasSurrogatePair = false;
wasUnmatchedSurrogate = false;
return first;
}
#endif
Expand All @@ -337,19 +339,22 @@ private static int GetScalarValueFromUtf16Slow(char first, char? second, out boo
{
// valid surrogate pair - extract codepoint
wasSurrogatePair = true;
wasUnmatchedSurrogate = false;
return GetScalarValueFromUtf16SurrogatePair(first, second.Value);
}
else
{
// unmatched surrogate - substitute
wasSurrogatePair = false;
wasUnmatchedSurrogate = true;
return UNICODE_REPLACEMENT_CHAR;
}
}
else
{
// unmatched surrogate - substitute
wasSurrogatePair = false;
wasUnmatchedSurrogate = true;
return UNICODE_REPLACEMENT_CHAR;
}
}
Expand All @@ -358,6 +363,7 @@ private static int GetScalarValueFromUtf16Slow(char first, char? second, out boo
// unmatched surrogate - substitute
Debug.Assert(char.IsLowSurrogate(first));
wasSurrogatePair = false;
wasUnmatchedSurrogate = true;
return UNICODE_REPLACEMENT_CHAR;
}
}
Expand Down
Loading

0 comments on commit 9fbd6ee

Please sign in to comment.