From a5136765a90168dbaa3b7e87b507c479d27540e5 Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Mon, 17 Jul 2023 15:23:26 -0700 Subject: [PATCH] Light up core ASCII.Utility methods with Vector256/Vector512 code paths. (#88532) * Lib upgrade for ToUtf16 * Upgrade NarrowUtf16ToAscii with Vector512 * Complete the upgrade in NarrowUtf16ToAscii method with Vector512 and Vector256 APIs. * Adding VectorXX paths to `GetIndexOfFirstNonAscii` functions. * Adding optimization to Vector256 VectorContainsNonAsciiChar method. * Code path refactoring and cleanup. * Code changes based on the review: 1. turn some variables into explicitly specified const. 2. removed some helper functions and inlined them. * Resolve comments * revert the changes at GetIndexOfFirstNonAsciiByte --------- Co-authored-by: Ruihan-Yin --- .../src/System/Text/Ascii.Utility.cs | 852 +++++++++++++++++++-- .../tests/Ascii/FromUtf16Tests.cs | 16 +- .../tests/Ascii/ToUtf16Tests.cs | 16 +- 3 files changed, 808 insertions(+), 76 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 69b3cc1..e30b3d0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -101,9 +101,238 @@ namespace System.Text // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. - return (Sse2.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) - ? 
GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength) - : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength); + if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated) + { + return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); + } + else if (Sse2.IsSupported || (AdvSimd.IsSupported && BitConverter.IsLittleEndian)) + { + return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength); + } + else + { + return GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength); + } + } + + private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nuint bufferLength) + { + // Squirrel away the original buffer reference. This method works by determining the exact + // byte reference where non-ASCII data begins, so we need this base value to perform the + // final subtraction at the end of the method to get the index into the original buffer. + + byte* pOriginalBuffer = pBuffer; + + // Before we drain off byte-by-byte, try a generic vectorized loop. + // Only run the loop if we have at least two vectors we can pull out. + // Note use of SBYTE instead of BYTE below; we're using the two's-complement + // representation of negative integers to act as a surrogate for "is ASCII?". + + if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) + { + + if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector512.Size; + pBuffer = (byte*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1)); + +#if DEBUG + long numBytesRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numBytesRead && numBytesRead <= Vector512.Size, "We should've made forward progress of at least one byte."); + Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); + if (Vector512.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) + { + break; // found non-ASCII data + } + + pBuffer += Vector512.Size; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= (nuint)pBuffer; + bufferLength += (nuint)pOriginalBuffer; + } + } + else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) + { + + if (Vector256.Load(pBuffer).ExtractMostSignificantBits() == 0) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector256.Size; + pBuffer = (byte*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1)); + +#if DEBUG + long numBytesRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numBytesRead && numBytesRead <= Vector256.Size, "We should've made forward progress of at least one byte."); + Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); + if (Vector256.LoadAligned(pBuffer).ExtractMostSignificantBits() != 0) + { + break; // found non-ASCII data + } + + pBuffer += Vector256.Size; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= (nuint)pBuffer; + bufferLength += (nuint)pOriginalBuffer; + } + } + else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) + { + + if (!VectorContainsNonAsciiChar(Vector128.Load(pBuffer))) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + byte* pFinalVectorReadPos = pBuffer + bufferLength - Vector128.Size; + pBuffer = (byte*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); + +#if DEBUG + long numBytesRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numBytesRead && numBytesRead <= Vector128.Size, "We should've made forward progress of at least one byte."); + Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned."); + if (VectorContainsNonAsciiChar(Vector128.LoadAligned(pBuffer))) + { + break; // found non-ASCII data + } + + pBuffer += Vector128.Size; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= (nuint)pBuffer; + bufferLength += (nuint)pOriginalBuffer; + } + } + + // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform + // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code + // path to drain any remaining ASCII bytes. + // + // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads. + // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes. + + uint currentUInt32; + + // Try reading 64 bits at a time in a loop. + + for (; bufferLength >= 8; bufferLength -= 8) + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + uint nextUInt32 = Unsafe.ReadUnaligned(pBuffer + 4); + + if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32)) + { + // One of these two values contains non-ASCII bytes. + // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes. 
+ + if (AllBytesInUInt32AreAscii(currentUInt32)) + { + currentUInt32 = nextUInt32; + pBuffer += 4; + } + + goto FoundNonAsciiData; + } + + pBuffer += 8; // consumed 8 ASCII bytes + } + + // From this point forward we don't need to update bufferLength. + // Try reading 32 bits. + + if ((bufferLength & 4) != 0) + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + if (!AllBytesInUInt32AreAscii(currentUInt32)) + { + goto FoundNonAsciiData; + } + + pBuffer += 4; + } + + // Try reading 16 bits. + + if ((bufferLength & 2) != 0) + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + if (!AllBytesInUInt32AreAscii(currentUInt32)) + { + if (!BitConverter.IsLittleEndian) + { + currentUInt32 <<= 16; + } + goto FoundNonAsciiData; + } + + pBuffer += 2; + } + + // Try reading 8 bits + + if ((bufferLength & 1) != 0) + { + // If the buffer contains non-ASCII data, the comparison below will fail, and + // we'll end up not incrementing the buffer reference. + + if (*(sbyte*)pBuffer >= 0) + { + pBuffer++; + } + } + + Finish: + + nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer; + return totalNumBytesRead; + + FoundNonAsciiData: + + Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input."); + + // The method being called doesn't bother looking at whether the high byte is ASCII. There are only + // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before + // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be + // non-ASCII. In both cases we only care about the low 24 bits. 
+ + pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32); + goto Finish; } private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength) @@ -609,39 +838,251 @@ namespace System.Text pBuffer += 2; // successfully consumed 2 ASCII bytes } - // BYTE drain + // BYTE drain + + if ((bufferLength & 1) != 0) + { + // sbyte has non-negative value if byte is ASCII. + + if (*(sbyte*)(pBuffer) >= 0) + { + pBuffer++; // successfully consumed a single byte + } + } + + goto Finish; + } + + /// + /// Returns the index in where the first non-ASCII char is found. + /// Returns if the buffer is empty or all-ASCII. + /// + /// An ASCII char is defined as 0x0000 - 0x007F, inclusive. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */) + { + // If SSE2/ASIMD is supported, use those specific intrinsics instead of the generic vectorized + // code below. This has two benefits: (a) we can take advantage of specific instructions like + // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while + // this method is running. 
+ + if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated) + { + return GetIndexOfFirstNonAsciiChar_Vector(pBuffer, bufferLength); + } + else if (Sse2.IsSupported || (AdvSimd.IsSupported && BitConverter.IsLittleEndian)) + { + return GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength); + } + else + { + return GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength); + } + } + + private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nuint bufferLength /* in chars */) + { + // Squirrel away the original buffer reference. This method works by determining the exact + // char reference where non-ASCII data begins, so we need this base value to perform the + // final subtraction at the end of the method to get the index into the original buffer. + char* pOriginalBuffer = pBuffer; + +#if SYSTEM_PRIVATE_CORELIB + Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char)); +#endif + + // Before we drain off char-by-char, try a generic vectorized loop. + // Only run the loop if we have at least two vectors we can pull out. + if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) + { + const uint SizeOfVector512InChars = Vector512.Size / sizeof(ushort); + + if (!VectorContainsNonAsciiChar(Vector512.Load((ushort*)pBuffer))) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector512InChars; + pBuffer = (char*)(((nuint)pBuffer + Vector512.Size) & ~(nuint)(Vector512.Size - 1)); + +#if DEBUG + long numCharsRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector512InChars, "We should've made forward progress of at least one char."); + Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % Vector512.Size == 0, "Vector read should be aligned."); + if (VectorContainsNonAsciiChar(Vector512.LoadAligned((ushort*)pBuffer))) + { + break; // found non-ASCII data + } + pBuffer += SizeOfVector512InChars; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); + } + } + else if (Vector256.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector256.Count) + { + const uint SizeOfVector256InChars = Vector256.Size / sizeof(ushort); + + if (!VectorContainsNonAsciiChar(Vector256.Load((ushort*)pBuffer))) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ + char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector256InChars; + pBuffer = (char*)(((nuint)pBuffer + Vector256.Size) & ~(nuint)(Vector256.Size - 1)); + +#if DEBUG + long numCharsRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector256InChars, "We should've made forward progress of at least one char."); + Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % Vector256.Size == 0, "Vector read should be aligned."); + if (VectorContainsNonAsciiChar(Vector256.LoadAligned((ushort*)pBuffer))) + { + break; // found non-ASCII data + } + pBuffer += SizeOfVector256InChars; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); + } + } + else if (Vector128.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector128.Count) + { + const uint SizeOfVector128InChars = Vector128.Size / sizeof(ushort); // JIT will make this a const + + if (!VectorContainsNonAsciiChar(Vector128.Load((ushort*)pBuffer))) + { + // The first several elements of the input buffer were ASCII. Bump up the pointer to the + // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII + // data or we approach the end of the buffer. It's possible we'll reread data; this is ok. 
+ char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVector128InChars; + pBuffer = (char*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); + +#if DEBUG + long numCharsRead = pBuffer - pOriginalBuffer; + Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char."); + Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer."); +#endif + + Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector."); + + do + { + Debug.Assert((nuint)pBuffer % Vector128.Size == 0, "Vector read should be aligned."); + if (VectorContainsNonAsciiChar(Vector128.LoadAligned((ushort*)pBuffer))) + { + break; // found non-ASCII data + } + pBuffer += SizeOfVector128InChars; + } while (pBuffer <= pFinalVectorReadPos); + + // Adjust the remaining buffer length for the number of elements we just consumed. + + bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); + } + } + + + // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform + // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code + // path to drain any remaining ASCII chars. + // + // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads. + // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars. + + uint currentUInt32; + + // Try reading 64 bits at a time in a loop. + + for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + uint nextUInt32 = Unsafe.ReadUnaligned(pBuffer + 4 / sizeof(char)); + + if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32)) + { + // One of these two values contains non-ASCII chars. 
+ // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars. + + if (AllCharsInUInt32AreAscii(currentUInt32)) + { + currentUInt32 = nextUInt32; + pBuffer += 2; + } + + goto FoundNonAsciiData; + } + + pBuffer += 4; // consumed 4 ASCII chars + } + + // From this point forward we don't need to keep track of the remaining buffer length. + // Try reading 32 bits. + + if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars + { + currentUInt32 = Unsafe.ReadUnaligned(pBuffer); + if (!AllCharsInUInt32AreAscii(currentUInt32)) + { + goto FoundNonAsciiData; + } + + pBuffer += 2; + } + + // Try reading 16 bits. + // No need to try an 8-bit read after this since we're working with chars. + + if ((bufferLength & 1) != 0) + { + // If the buffer contains non-ASCII data, the comparison below will fail, and + // we'll end up not incrementing the buffer reference. + + if (*pBuffer <= 0x007F) + { + pBuffer++; + } + } + + Finish: + + nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer; + Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars."); + return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning + + FoundNonAsciiData: + + Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input."); + + // We don't bother looking at the second char - only the first char. - if ((bufferLength & 1) != 0) + if (FirstCharInUInt32IsAscii(currentUInt32)) { - // sbyte has non-negative value if byte is ASCII. - - if (*(sbyte*)(pBuffer) >= 0) - { - pBuffer++; // successfully consumed a single byte - } + pBuffer++; } goto Finish; } - /// - /// Returns the index in where the first non-ASCII char is found. - /// Returns if the buffer is empty or all-ASCII. - /// - /// An ASCII char is defined as 0x0000 - 0x007F, inclusive. 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */) - { - // If SSE2/ASIMD is supported, use those specific intrinsics instead of the generic vectorized - // code below. This has two benefits: (a) we can take advantage of specific instructions like - // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while - // this method is running. - - return ((Sse2.IsSupported || AdvSimd.IsSupported) && BitConverter.IsLittleEndian) - ? GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength) - : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength); - } - private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */) { // Squirrel away the original buffer reference.This method works by determining the exact @@ -683,7 +1124,7 @@ namespace System.Text do { - Debug.Assert((nuint)pBuffer % SizeOfVectorInChars == 0, "Vector read should be aligned."); + Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned."); if (Vector.GreaterThanAny(Unsafe.Read>(pBuffer), maxAscii)) { break; // found non-ASCII data @@ -792,8 +1233,7 @@ namespace System.Text // JIT turns the below into constants - uint SizeOfVector128InBytes = (uint)sizeof(Vector128); - uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char); + uint SizeOfVector128InChars = Vector128.Size / sizeof(char); Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Should've been checked by caller."); Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 assumes little-endian."); @@ -831,14 +1271,14 @@ namespace System.Text bufferLength <<= 1; // chars to bytes - if (bufferLength < 2 * SizeOfVector128InBytes) + if (bufferLength < 2 * Vector128.Size) { goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead; } // Now adjust the read pointer so that future reads are aligned. 
- pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1)); + pBuffer = (char*)(((nuint)pBuffer + Vector128.Size) & ~(nuint)(Vector128.Size - 1)); #if DEBUG long numCharsRead = pBuffer - pOriginalBuffer; @@ -853,9 +1293,9 @@ namespace System.Text // The buffer is now properly aligned. // Read 2 vectors at a time if possible. - if (bufferLength >= 2 * SizeOfVector128InBytes) + if (bufferLength >= 2 * Vector128.Size) { - char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes); + char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * Vector128.Size); // After this point, we no longer need to update the bufferLength value. do @@ -884,7 +1324,7 @@ namespace System.Text // If there is fewer than one vector length remaining, skip the next aligned read. // Remember, at this point bufferLength is measured in bytes, not chars. - if ((bufferLength & SizeOfVector128InBytes) == 0) + if ((bufferLength & Vector128.Size) == 0) { goto DoFinalUnalignedVectorRead; } @@ -904,12 +1344,12 @@ namespace System.Text DoFinalUnalignedVectorRead: - if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0) + if (((byte)bufferLength & (Vector128.Size - 1)) != 0) { // Perform an unaligned read of the last vector. // We need to adjust the pointer because we're re-reading data. 
- pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes); + pBuffer = (char*)((byte*)pBuffer + (bufferLength & (Vector128.Size - 1)) - Vector128.Size); firstVector = Vector128.LoadUnsafe(ref *(ushort*)pBuffer); if (VectorContainsNonAsciiChar(firstVector)) { @@ -1172,32 +1612,39 @@ namespace System.Text uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0; ulong utf16Data64Bits = 0; - if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian) + if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian && elementCount >= 2 * (uint)Vector128.Count) { - if (elementCount >= 2 * (uint)Vector128.Count) - { - // Since there's overhead to setting up the vectorized code path, we only want to - // call into it after a quick probe to ensure the next immediate characters really are ASCII. - // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method. + // Since there's overhead to setting up the vectorized code path, we only want to + // call into it after a quick probe to ensure the next immediate characters really are ASCII. + // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method. 
- if (IntPtr.Size >= 8) + if (IntPtr.Size >= 8) + { + utf16Data64Bits = Unsafe.ReadUnaligned(pUtf16Buffer); + if (!AllCharsInUInt64AreAscii(utf16Data64Bits)) { - utf16Data64Bits = Unsafe.ReadUnaligned(pUtf16Buffer); - if (!AllCharsInUInt64AreAscii(utf16Data64Bits)) - { - goto FoundNonAsciiDataIn64BitRead; - } + goto FoundNonAsciiDataIn64BitRead; } - else + } + else + { + utf16Data32BitsHigh = Unsafe.ReadUnaligned(pUtf16Buffer); + utf16Data32BitsLow = Unsafe.ReadUnaligned(pUtf16Buffer + 4 / sizeof(char)); + if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow)) { - utf16Data32BitsHigh = Unsafe.ReadUnaligned(pUtf16Buffer); - utf16Data32BitsLow = Unsafe.ReadUnaligned(pUtf16Buffer + 4 / sizeof(char)); - if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow)) - { - goto FoundNonAsciiDataIn64BitRead; - } + goto FoundNonAsciiDataIn64BitRead; } - + } + if (Vector512.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector512.Count) + { + currentOffset = NarrowUtf16ToAscii_Intrinsified_512(pUtf16Buffer, pAsciiBuffer, elementCount); + } + else if (Vector256.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector256.Count) + { + currentOffset = NarrowUtf16ToAscii_Intrinsified_256(pUtf16Buffer, pAsciiBuffer, elementCount); + } + else + { currentOffset = NarrowUtf16ToAscii_Intrinsified(pUtf16Buffer, pAsciiBuffer, elementCount); } } @@ -1445,6 +1892,32 @@ namespace System.Text } [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector256 utf16Vector) + { + if (Avx.IsSupported) + { + Vector256 asciiMaskForTestZ = Vector256.Create((ushort)0xFF80); + return !Avx.TestZ(utf16Vector.AsInt16(), asciiMaskForTestZ.AsInt16()); + } + else + { + const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80 + Vector256 zeroIsAscii = utf16Vector & Vector256.Create(asciiMask); + // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data. 
+ return zeroIsAscii != Vector256.Zero; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool VectorContainsNonAsciiChar(Vector512 utf16Vector) + { + const ushort asciiMask = ushort.MaxValue - 127; // 0xFF80 + Vector512 zeroIsAscii = utf16Vector & Vector512.Create(asciiMask); + // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data. + return zeroIsAscii != Vector512.Zero; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 vector) where T : unmanaged { @@ -1639,6 +2112,241 @@ namespace System.Text goto Finish; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_256(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) + { + // This method contains logic optimized using vector instructions for x64 only. + // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support. + + // JIT turns the below into constants + + const nuint MaskOfAllBitsInVector256 = (nuint)(Vector256.Size - 1); + + // This method is written such that control generally flows top-to-bottom, avoiding + // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII + // data, we jump out of the hot paths to targets at the end of the method. + + Debug.Assert(Vector256.IsHardwareAccelerated, "Vector256 is required."); + Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian."); + Debug.Assert(elementCount >= 2 * Vector256.Size); + + // First, perform an unaligned read of the first part of the input buffer. + ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer; + Vector256 utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer); + + // If there's non-ASCII data in the first 16 elements of the vector, there's nothing we can do. 
+ if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + return 0; + } + + // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination. + + ref byte asciiBuffer = ref *pAsciiBuffer; + Vector256 asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, 0); + nuint currentOffsetInElements = Vector256.Size / 2; // we processed 16 elements so far + + // We're going to get the best performance when we have aligned writes, so we'll take the + // hit of potentially unaligned reads in order to hit this sweet spot. + + // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote + // the 16 bytes previously. If the 0x10 bit is set at the pinned address, then the 16 bytes we wrote + // previously mean that the 0x10 bit is *not* set at address &pAsciiBuffer[SizeOfVector256 / 2]. In + // that case we can immediately back up to the previous aligned boundary and start the main loop. + // If the 0x10 bit is *not* set at the pinned address, then it means the 0x10 bit *is* set at + // address &pAsciiBuffer[SizeOfVector256 / 2], and we should perform one more 16-byte write to bump + // just past the next aligned boundary address. + if (((uint)pAsciiBuffer & (Vector256.Size / 2)) == 0) + { + // We need to perform one more partial vector write before we can get the alignment we want. + + utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // Turn the 16 ASCII chars we just read into 16 ASCII bytes, then copy it to the destination. + asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + } + + // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment + // point, then use that as the base offset going forward. 
+ + currentOffsetInElements = Vector256.Size - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector256); + + Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= Vector256.Size, "We wrote at least 1 byte but no more than a whole vector."); + Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer."); + Debug.Assert(elementCount - currentOffsetInElements >= Vector256.Size, "We should be able to run at least one whole vector."); + + nuint finalOffsetWhereCanRunLoop = elementCount - Vector256.Size; + do + { + // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector. + + utf16VectorFirst = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + Vector256 utf16VectorSecond = Vector256.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + Vector256.Size / sizeof(short)); + Vector256 combinedVector = utf16VectorFirst | utf16VectorSecond; + + if (VectorContainsNonAsciiChar(combinedVector)) + { + goto FoundNonAsciiDataInLoop; + } + + // Build up the ASCII vector and perform the store. + + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector256.Size == 0, "Write should be aligned."); + asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorSecond); + asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + + currentOffsetInElements += Vector256.Size; + } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop); + + Finish: + + // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain. + return currentOffsetInElements; + + FoundNonAsciiDataInLoop: + + // Can we at least narrow the high vector? + // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works. + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector. 
+ + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector128.Size == 0, "Destination should be 128-bit-aligned."); + asciiVector = Vector256.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + currentOffsetInElements += Vector256.Size / 2; + + goto Finish; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) + { + // This method contains logic optimized using vector instructions for x64 only. + // Much of the logic in this method will be elided by JIT once we determine which specific ISAs we support. + + // JIT turns the below into constants + + const nuint MaskOfAllBitsInVector512 = (nuint)(Vector512.Size - 1); + + // This method is written such that control generally flows top-to-bottom, avoiding + // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII + // data, we jump out of the hot paths to targets at the end of the method. + + Debug.Assert(Vector512.IsHardwareAccelerated, "Vector512 is required."); + Debug.Assert(BitConverter.IsLittleEndian, "This implementation assumes little-endian."); + Debug.Assert(elementCount >= 2 * Vector512.Size); + + // First, perform an unaligned read of the first part of the input buffer. + ref ushort utf16Buffer = ref *(ushort*)pUtf16Buffer; + Vector512 utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer); + + // If there's non-ASCII data in the first 32 elements of the vector, there's nothing we can do. + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + return 0; + } + + // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. 
+ + ref byte asciiBuffer = ref *pAsciiBuffer; + Vector512 asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, 0); // store only the lower 256 bits (the 32 narrowed bytes) + nuint currentOffsetInElements = Vector512.Size / 2; // we processed 32 elements so far + + // We're going to get the best performance when we have aligned writes, so we'll take the + // hit of potentially unaligned reads in order to hit this sweet spot. + + // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote + // the 32 bytes previously. If the 0x20 bit is set at the pinned address, then the 32 bytes we wrote + // previously mean that the 0x20 bit is *not* set at address &pAsciiBuffer[Vector512.Size / 2]. In + // that case we can immediately back up to the previous aligned boundary and start the main loop. + // If the 0x20 bit is *not* set at the pinned address, then it means the 0x20 bit *is* set at + // address &pAsciiBuffer[Vector512.Size / 2], and we should perform one more 32-byte write to bump + // just past the next aligned boundary address. + + if (((uint)pAsciiBuffer & (Vector512.Size / 2)) == 0) + { + // We need to perform one more partial vector write before we can get the alignment we want. + + utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // Turn the 32 ASCII chars we just read into 32 ASCII bytes, then copy it to the destination. + asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + } + + // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment + // point, then use that as the base offset going forward.
+ + currentOffsetInElements = Vector512.Size - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector512); + + Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= Vector512.Size, "We wrote at least 1 byte but no more than a whole vector."); + Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer."); + Debug.Assert(elementCount - currentOffsetInElements >= Vector512.Size, "We should be able to run at least one whole vector."); + + nuint finalOffsetWhereCanRunLoop = elementCount - Vector512.Size; + do + { + // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector. + + utf16VectorFirst = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements); + Vector512 utf16VectorSecond = Vector512.LoadUnsafe(ref utf16Buffer, currentOffsetInElements + Vector512.Size / sizeof(short)); + Vector512 combinedVector = utf16VectorFirst | utf16VectorSecond; + + if (VectorContainsNonAsciiChar(combinedVector)) + { + goto FoundNonAsciiDataInLoop; + } + + // Build up the ASCII vector and perform the store. + + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector512.Size == 0, "Write should be aligned."); + asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorSecond); + asciiVector.StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + + currentOffsetInElements += Vector512.Size; + } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop); + + Finish: + + // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain. + return currentOffsetInElements; + + FoundNonAsciiDataInLoop: + + // Can we at least narrow the first vector? + // See comments in GetIndexOfFirstNonAsciiChar_Intrinsified for information about how this works. + if (VectorContainsNonAsciiChar(utf16VectorFirst)) + { + goto Finish; + } + + // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
+ + Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % Vector256.Size == 0, "Destination should be 256-bit-aligned."); + asciiVector = Vector512.Narrow(utf16VectorFirst, utf16VectorFirst); + asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + currentOffsetInElements += Vector512.Size / 2; + + goto Finish; + } + /// /// Copies as many ASCII bytes (00..7F) as possible from /// to , stopping when the first non-ASCII byte is encountered @@ -1654,7 +2362,31 @@ namespace System.Text { ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; - if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) + if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512.Count) + { + // Calculating the destination address outside the loop results in significant + // perf wins vs. relying on the JIT to fold memory addressing logic into the + // write instructions. See: https://github.com/dotnet/runtime/issues/33002 + nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512.Count; + + do + { + Vector512 asciiVector = Vector512.Load(pAsciiBuffer + currentOffset); + + if (asciiVector.ExtractMostSignificantBits() != 0) + { + break; + } + + (Vector512 utf16LowVector, Vector512 utf16HighVector) = Vector512.Widen(asciiVector); + utf16LowVector.Store(pCurrentWriteAddress); + utf16HighVector.Store(pCurrentWriteAddress + Vector512.Count); + + currentOffset += (nuint)Vector512.Count; + pCurrentWriteAddress += (nuint)Vector512.Count; + } while (currentOffset <= finalOffsetWhereCanRunLoop); + } + else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) { // Calculating the destination address outside the loop results in significant // perf wins vs. 
relying on the JIT to fold memory addressing logic into the diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs index 80a7004..2f9186c 100644 --- a/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs +++ b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs @@ -19,15 +19,15 @@ namespace System.Text.Tests [Fact] public static void AllAsciiInput() { - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F. Span utf16Span = utf16Mem.Span; for (int i = 0; i < utf16Span.Length; i++) { - utf16Span[i] = (char)i; + utf16Span[i] = (char)(i % 128); } utf16Mem.MakeReadonly(); @@ -42,11 +42,11 @@ namespace System.Text.Tests // First, validate that the workhorse saw the incoming data as all-ASCII. Assert.Equal(OperationStatus.Done, Ascii.FromUtf16(utf16Span.Slice(i), asciiSpan.Slice(i), out int bytesWritten)); - Assert.Equal(128 - i, bytesWritten); + Assert.Equal(256 - i, bytesWritten); // Then, validate that the data was transcoded properly. - for (int j = i; j < 128; j++) + for (int j = i; j < 256; j++) { Assert.Equal((ushort)utf16Span[i], (ushort)asciiSpan[i]); } @@ -56,15 +56,15 @@ namespace System.Text.Tests [Fact] public static void SomeNonAsciiInput() { - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F. Span utf16Span = utf16Mem.Span; for (int i = 0; i < utf16Span.Length; i++) { - utf16Span[i] = (char)i; + utf16Span[i] = (char)(i % 128); } // We'll write to the ASCII span. 
diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs index be9c71e..ec5c186 100644 --- a/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs +++ b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs @@ -20,15 +20,15 @@ namespace System.Text.Tests [Fact] public static void AllAsciiInput() { - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F, then trap future writes. Span asciiSpan = asciiMem.Span; for (int i = 0; i < asciiSpan.Length; i++) { - asciiSpan[i] = (byte)i; + asciiSpan[i] = (byte)(i % 128); } asciiMem.MakeReadonly(); @@ -44,11 +44,11 @@ namespace System.Text.Tests // First, validate that the workhorse saw the incoming data as all-ASCII. Assert.Equal(OperationStatus.Done, Ascii.ToUtf16(asciiSpan.Slice(i), utf16Span.Slice(i), out int charsWritten)); - Assert.Equal(128 - i, charsWritten); + Assert.Equal(256 - i, charsWritten); // Then, validate that the data was transcoded properly. - for (int j = i; j < 128; j++) + for (int j = i; j < 256; j++) { Assert.Equal((ushort)asciiSpan[i], (ushort)utf16Span[i]); } @@ -58,15 +58,15 @@ namespace System.Text.Tests [Fact] public static void SomeNonAsciiInput() { - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + using BoundedMemory asciiMem = BoundedMemory.Allocate(256); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(256); // Fill source with 00 .. 7F, then trap future writes. Span asciiSpan = asciiMem.Span; for (int i = 0; i < asciiSpan.Length; i++) { - asciiSpan[i] = (byte)i; + asciiSpan[i] = (byte)(i % 128); } // We'll write to the UTF-16 span. -- 2.7.4