-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Switch to iSimdVector and Align WidenAsciiToUtf16 #99982
Changes from all commits
707e56c
4b4f15f
2c7d258
90c5092
424c52a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B | |
|
||
if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128<byte>.Count) | ||
{ | ||
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; | ||
|
||
if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512<byte>.Count) | ||
if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512<byte>.Count) | ||
{ | ||
// Calculating the destination address outside the loop results in significant | ||
// perf wins vs. relying on the JIT to fold memory addressing logic into the | ||
// write instructions. See: https://github.com/dotnet/runtime/issues/33002 | ||
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512<byte>.Count; | ||
|
||
do | ||
{ | ||
Vector512<byte> asciiVector = Vector512.Load(pAsciiBuffer + currentOffset); | ||
|
||
if (asciiVector.ExtractMostSignificantBits() != 0) | ||
{ | ||
break; | ||
} | ||
|
||
(Vector512<ushort> utf16LowVector, Vector512<ushort> utf16HighVector) = Vector512.Widen(asciiVector); | ||
utf16LowVector.Store(pCurrentWriteAddress); | ||
utf16HighVector.Store(pCurrentWriteAddress + Vector512<ushort>.Count); | ||
|
||
currentOffset += (nuint)Vector512<byte>.Count; | ||
pCurrentWriteAddress += (nuint)Vector512<byte>.Count; | ||
} while (currentOffset <= finalOffsetWhereCanRunLoop); | ||
WidenAsciiToUtf1_Vector<Vector512<byte>, Vector512<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); | ||
} | ||
else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count) | ||
else if (Vector256.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector256<byte>.Count) | ||
{ | ||
// Calculating the destination address outside the loop results in significant | ||
// perf wins vs. relying on the JIT to fold memory addressing logic into the | ||
// write instructions. See: https://github.com/dotnet/runtime/issues/33002 | ||
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256<byte>.Count; | ||
|
||
do | ||
{ | ||
Vector256<byte> asciiVector = Vector256.Load(pAsciiBuffer + currentOffset); | ||
|
||
if (asciiVector.ExtractMostSignificantBits() != 0) | ||
{ | ||
break; | ||
} | ||
|
||
(Vector256<ushort> utf16LowVector, Vector256<ushort> utf16HighVector) = Vector256.Widen(asciiVector); | ||
utf16LowVector.Store(pCurrentWriteAddress); | ||
utf16HighVector.Store(pCurrentWriteAddress + Vector256<ushort>.Count); | ||
|
||
currentOffset += (nuint)Vector256<byte>.Count; | ||
pCurrentWriteAddress += (nuint)Vector256<byte>.Count; | ||
} while (currentOffset <= finalOffsetWhereCanRunLoop); | ||
WidenAsciiToUtf1_Vector<Vector256<byte>, Vector256<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); | ||
} | ||
else | ||
else if (Vector128.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector128<byte>.Count) | ||
{ | ||
// Calculating the destination address outside the loop results in significant | ||
// perf wins vs. relying on the JIT to fold memory addressing logic into the | ||
// write instructions. See: https://github.com/dotnet/runtime/issues/33002 | ||
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128<byte>.Count; | ||
|
||
do | ||
{ | ||
Vector128<byte> asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); | ||
|
||
if (VectorContainsNonAsciiChar(asciiVector)) | ||
{ | ||
break; | ||
} | ||
|
||
(Vector128<ushort> utf16LowVector, Vector128<ushort> utf16HighVector) = Vector128.Widen(asciiVector); | ||
utf16LowVector.Store(pCurrentWriteAddress); | ||
utf16HighVector.Store(pCurrentWriteAddress + Vector128<ushort>.Count); | ||
|
||
currentOffset += (nuint)Vector128<byte>.Count; | ||
pCurrentWriteAddress += (nuint)Vector128<byte>.Count; | ||
} while (currentOffset <= finalOffsetWhereCanRunLoop); | ||
WidenAsciiToUtf1_Vector<Vector128<byte>, Vector128<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); | ||
} | ||
} | ||
|
||
|
@@ -2212,6 +2150,85 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B | |
goto Finish; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUInt16>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount) | ||
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte> | ||
where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort> | ||
{ | ||
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; | ||
// Calculating the destination address outside the loop results in significant | ||
// perf wins vs. relying on the JIT to fold memory addressing logic into the | ||
// write instructions. See: https://github.com/dotnet/runtime/issues/33002 | ||
nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count; | ||
TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset); | ||
if (!HasMatch<TVectorByte>(asciiVector)) | ||
{ | ||
(TVectorUInt16 utf16LowVector, TVectorUInt16 utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector); | ||
utf16LowVector.Store(pCurrentWriteAddress); | ||
utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count); | ||
pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2); | ||
if (((nuint)pCurrentWriteAddress % sizeof(char)) == 0) | ||
{ | ||
// Bump write buffer up to the next aligned boundary | ||
pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUInt16.Alignment - 1)); | ||
nuint numBytesWritten = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer; | ||
currentOffset += (nuint)numBytesWritten / 2; | ||
} | ||
else | ||
{ | ||
// If input isn't char aligned, we won't be able to align it to a Vector | ||
currentOffset += (nuint)TVectorByte.Count; | ||
} | ||
while (currentOffset <= finalOffsetWhereCanRunLoop) | ||
{ | ||
asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset); | ||
if (HasMatch<TVectorByte>(asciiVector)) | ||
{ | ||
break; | ||
} | ||
(utf16LowVector, utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector); | ||
utf16LowVector.Store(pCurrentWriteAddress); | ||
utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count); | ||
|
||
currentOffset += (nuint)TVectorByte.Count; | ||
pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2); | ||
} | ||
} | ||
return; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static unsafe bool HasMatch<TVectorByte>(TVectorByte vector) | ||
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte> | ||
{ | ||
return !(vector & TVectorByte.Create((byte)0x80)).Equals(TVectorByte.Zero); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I basically ran into a weird issue where perf degraded significantly for specific cases on my ICX when I had '(vector & TVectorByte.Create((byte)0x80)) != TVectorByte.Zero' and the issue went away with what I have now. It was pretty consistent. I had some trouble narrowing down the exact why with VTune though. I decided to go with the performant version for now There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was there a codegen difference between them? I'd expect them to generate the same code There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did a quick check on godbolt and they all look the same - https://godbolt.org/z/P87dPdTGa Now I'm curious if it was just something off when I ran it locally There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have a strong preference for any of these patterns? I can look into the why this is happening if it's important. For now, I'm just keeping the fast version There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tannergooding I have created an issue detailing this as discussed: #100493 Please let me know if there is anything else needed for this PR |
||
} | ||
|
||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static unsafe (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen<TVectorByte, TVectorUInt16>(TVectorByte vector) | ||
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte> | ||
where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort> | ||
{ | ||
if (typeof(TVectorByte) == typeof(Vector256<byte>)) | ||
{ | ||
(Vector256<ushort> Lower256, Vector256<ushort> Upper256) = Vector256.Widen((Vector256<byte>)(object)vector); | ||
return ((TVectorUInt16)(object)Lower256, (TVectorUInt16)(object)Upper256); | ||
} | ||
else if (typeof(TVectorByte) == typeof(Vector512<byte>)) | ||
{ | ||
(Vector512<ushort> Lower512, Vector512<ushort> Upper512) = Vector512.Widen((Vector512<byte>)(object)vector); | ||
return ((TVectorUInt16)(object)Lower512, (TVectorUInt16)(object)Upper512); | ||
} | ||
else | ||
{ | ||
Debug.Assert(typeof(TVectorByte) == typeof(Vector128<byte>)); | ||
(Vector128<ushort> Lower128, Vector128<ushort> Upper128) = Vector128.Widen((Vector128<byte>)(object)vector); | ||
return ((TVectorUInt16)(object)Lower128, (TVectorUInt16)(object)Upper128); | ||
} | ||
} | ||
|
||
|
||
/// <summary> | ||
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and | ||
/// writes them to the output buffer with machine endianness. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: unnecessary parens here and in
Any
If you could fix that in a follow up PR, that'd be great (going to merge this)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good! Will put it up later today.