diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/ISimdVector_2.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/ISimdVector_2.cs index a1d49045bb2ac..a88b5f0dec6a1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/ISimdVector_2.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/ISimdVector_2.cs @@ -553,6 +553,19 @@ static virtual bool TryCopyTo(TSelf vector, Span destination) // New Surface Area // + /// <summary>Checks if any of the vector lanes are equivalent to <paramref name="value" />.</summary> + /// <param name="vector">The vector to check.</param> + /// <param name="value">The value to look for.</param> + /// <returns><c>true</c> if <paramref name="vector" /> has any lanes equivalent to <paramref name="value" />; otherwise, <c>false</c> if none of the lanes are equivalent to <paramref name="value" />.</returns> + /// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception> + static abstract bool Any(TSelf vector, T value); + + /// <summary>Checks if any of the vector lanes have all bits set.</summary> + /// <param name="vector">The vector to check.</param> + /// <returns><c>true</c> if <paramref name="vector" /> has any lanes with all bits set; otherwise, <c>false</c> if none of the lanes have all bits set.</returns> + /// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception>
+ static abstract bool AnyWhereAllBitsSet(TSelf vector); + static abstract int IndexOfLastMatch(TSelf vector); } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs index 42bef9fe0ed94..efa2f1f148b92 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs @@ -692,6 +692,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri // New Surface Area // + static bool ISimdVector, T>.AnyWhereAllBitsSet(Vector128 vector) + { + return (Vector128.EqualsAny(vector, Vector128.AllBitsSet)); + } + + static bool ISimdVector, T>.Any(Vector128 vector, T value) + { + return (Vector128.EqualsAny(vector, Vector128.Create((T)value))); + } + static int ISimdVector, T>.IndexOfLastMatch(Vector128 vector) { uint mask = vector.ExtractMostSignificantBits(); diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs index fd1e889e44f24..af789a873535c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs @@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri // New Surface Area // + static bool ISimdVector, T>.AnyWhereAllBitsSet(Vector256 vector) + { + return (Vector256.EqualsAny(vector, Vector256.AllBitsSet)); + } + + static bool ISimdVector, T>.Any(Vector256 vector, T value) + { + return (Vector256.EqualsAny(vector, Vector256.Create((T)value))); + } + static int ISimdVector, T>.IndexOfLastMatch(Vector256 vector) { uint mask = vector.ExtractMostSignificantBits(); diff --git
a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512_1.cs index 5c3f6a593ebfe..c6e42c63c44f5 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512_1.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512_1.cs @@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri // New Surface Area // + static bool ISimdVector, T>.AnyWhereAllBitsSet(Vector512 vector) + { + return (Vector512.EqualsAny(vector, Vector512.AllBitsSet)); + } + + static bool ISimdVector, T>.Any(Vector512 vector, T value) + { + return (Vector512.EqualsAny(vector, Vector512.Create((T)value))); + } + static int ISimdVector, T>.IndexOfLastMatch(Vector512 vector) { ulong mask = vector.ExtractMostSignificantBits(); diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64_1.cs index 3ec5a675e61bc..241a2c294e4e3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64_1.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64_1.cs @@ -757,6 +757,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri // New Surface Area // + static bool ISimdVector, T>.AnyWhereAllBitsSet(Vector64 vector) + { + return (Vector64.EqualsAny(vector, Vector64.AllBitsSet)); + } + + static bool ISimdVector, T>.Any(Vector64 vector, T value) + { + return (Vector64.EqualsAny(vector, Vector64.Create((T)value))); + } + static int ISimdVector, T>.IndexOfLastMatch(Vector64 vector) { uint mask = vector.ExtractMostSignificantBits(); diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index ba07b985b6705..76d2701b0eaab
100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { - ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; - - if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512.Count) + if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512.Count) { - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions. See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512.Count; - - do - { - Vector512 asciiVector = Vector512.Load(pAsciiBuffer + currentOffset); - - if (asciiVector.ExtractMostSignificantBits() != 0) - { - break; - } - - (Vector512 utf16LowVector, Vector512 utf16HighVector) = Vector512.Widen(asciiVector); - utf16LowVector.Store(pCurrentWriteAddress); - utf16HighVector.Store(pCurrentWriteAddress + Vector512.Count); - - currentOffset += (nuint)Vector512.Count; - pCurrentWriteAddress += (nuint)Vector512.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); + WidenAsciiToUtf1_Vector, Vector512>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } - else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) + else if (Vector256.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector256.Count) { - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions.
See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256.Count; - - do - { - Vector256 asciiVector = Vector256.Load(pAsciiBuffer + currentOffset); - - if (asciiVector.ExtractMostSignificantBits() != 0) - { - break; - } - - (Vector256 utf16LowVector, Vector256 utf16HighVector) = Vector256.Widen(asciiVector); - utf16LowVector.Store(pCurrentWriteAddress); - utf16HighVector.Store(pCurrentWriteAddress + Vector256.Count); - - currentOffset += (nuint)Vector256.Count; - pCurrentWriteAddress += (nuint)Vector256.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); + WidenAsciiToUtf1_Vector, Vector256>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } - else + else if (Vector128.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector128.Count) { - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions.
See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128.Count; - - do - { - Vector128 asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); - - if (VectorContainsNonAsciiChar(asciiVector)) - { - break; - } - - (Vector128 utf16LowVector, Vector128 utf16HighVector) = Vector128.Widen(asciiVector); - utf16LowVector.Store(pCurrentWriteAddress); - utf16HighVector.Store(pCurrentWriteAddress + Vector128.Count); - - currentOffset += (nuint)Vector128.Count; - pCurrentWriteAddress += (nuint)Vector128.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); + WidenAsciiToUtf1_Vector, Vector128>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } } @@ -2212,6 +2150,85 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B goto Finish; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WidenAsciiToUtf1_Vector(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount) + where TVectorByte : unmanaged, ISimdVector + where TVectorUInt16 : unmanaged, ISimdVector + { + ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; + // Calculating the destination address outside the loop results in significant + // perf wins vs. relying on the JIT to fold memory addressing logic into the + // write instructions.
See: https://github.com/dotnet/runtime/issues/33002 + nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count; + TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset); + if (!HasMatch(asciiVector)) + { + (TVectorUInt16 utf16LowVector, TVectorUInt16 utf16HighVector) = Widen(asciiVector); + utf16LowVector.Store(pCurrentWriteAddress); + utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count); + pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2); + if (((nuint)pCurrentWriteAddress % sizeof(char)) == 0) + { + // Round the write pointer down using the TVectorUInt16.Alignment mask, then advance currentOffset by half the byte distance from the start of the buffer (two bytes per char) + pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUInt16.Alignment - 1)); + nuint numBytesWritten = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer; + currentOffset += (nuint)numBytesWritten / 2; + } + else + { + // If the write address isn't char aligned, we won't be able to align it to a Vector + currentOffset += (nuint)TVectorByte.Count; + } + while (currentOffset <= finalOffsetWhereCanRunLoop) + { + asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset); + if (HasMatch(asciiVector)) + { + break; + } + (utf16LowVector, utf16HighVector) = Widen(asciiVector); + utf16LowVector.Store(pCurrentWriteAddress); + utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count); + + currentOffset += (nuint)TVectorByte.Count; + pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2); + } + } + return; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe bool HasMatch(TVectorByte vector) + where TVectorByte : unmanaged, ISimdVector + { + return !(vector & TVectorByte.Create((byte)0x80)).Equals(TVectorByte.Zero); + } + + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen(TVectorByte vector) + where TVectorByte : unmanaged, ISimdVector + where TVectorUInt16 : unmanaged, ISimdVector + { + if (typeof(TVectorByte) == typeof(Vector256)) + { +
(Vector256 Lower256, Vector256 Upper256) = Vector256.Widen((Vector256)(object)vector); + return ((TVectorUInt16)(object)Lower256, (TVectorUInt16)(object)Upper256); + } + else if (typeof(TVectorByte) == typeof(Vector512)) + { + (Vector512 Lower512, Vector512 Upper512) = Vector512.Widen((Vector512)(object)vector); + return ((TVectorUInt16)(object)Lower512, (TVectorUInt16)(object)Upper512); + } + else + { + Debug.Assert(typeof(TVectorByte) == typeof(Vector128)); + (Vector128 Lower128, Vector128 Upper128) = Vector128.Widen((Vector128)(object)vector); + return ((TVectorUInt16)(object)Lower128, (TVectorUInt16)(object)Upper128); + } + } + + /// /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and /// writes them to the output buffer with machine endianness.