Port dotnet/runtime#31904 to release/3.1 (#28013)

author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>

Tue, 18 Feb 2020 18:06:38 +0000 (10:06 -0800)

committer Hyungju Lee <leee.lee@samsung.com>

Fri, 30 Oct 2020 09:20:49 +0000 (18:20 +0900)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Tue, 18 Feb 2020 18:06:38 +0000 (10:06 -0800)
committer Hyungju Lee <leee.lee@samsung.com>
Fri, 30 Oct 2020 09:20:49 +0000 (18:20 +0900)
diff --git a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs

index b37b017..bf2bccf 100644 (file)
--- a/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
@@ -1009,10 +1009,14 @@ namespace System.Text
          {
              Debug.Assert(AllCharsInUInt64AreAscii(value));
  
-            if (Bmi2.X64.IsSupported)
+            if (Sse2.X64.IsSupported)
              {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
+                // Narrows a vector of words [ w0 w1 w2 w3 0 0 0 0 ] to a vector of bytes whose low 64 bits are
+                // [ b0 b1 b2 b3 0 0 0 0 ], then writes the low 4 bytes (32 bits) to the destination.
+
+                Vector128<short> vecWide = Sse2.X64.ConvertScalarToVector128UInt64(value).AsInt16();
+                Vector128<uint> vecNarrow = Sse2.PackUnsignedSaturate(vecWide, vecWide).AsUInt32();
+                Unsafe.WriteUnaligned<uint>(ref outputBuffer, Sse2.ConvertToUInt32(vecNarrow));
              }
              else
              {
@@ -1694,14 +1698,16 @@ namespace System.Text
          /// writes them to the output buffer with machine endianness.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
+        internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
          {
              Debug.Assert(AllBytesInUInt32AreAscii(value));
  
-            if (Bmi2.X64.IsSupported)
+            if (Sse2.X64.IsSupported)
              {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
+                Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian.");
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide));
              }
              else
              {
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs

index 54940f9..9885a30 100644 (file)
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
@@ -6,7 +6,6 @@ using System.Buffers.Binary;
  using System.Diagnostics;
  using System.Numerics;
  using System.Runtime.CompilerServices;
-using System.Runtime.Intrinsics.X86;
  using Internal.Runtime.CompilerServices;
  
  namespace System.Text.Unicode
@@ -61,7 +60,7 @@ namespace System.Text.Unicode
          }
  
          /// <summary>
-        /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the input as a
+        /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a
          /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -69,39 +68,19 @@ namespace System.Text.Unicode
          {
              if (BitConverter.IsLittleEndian)
              {
-                if (Bmi2.IsSupported)
-                {
-                    // need to reverse endianness for bit manipulation to work correctly
-                    value = BinaryPrimitives.ReverseEndianness(value);
-
-                    // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
-                    // want to return [ 110110wwwwxxxxxx 110111xxxxxxxxxx ]
-                    // where wwww = uuuuu - 1
-
-                    uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u);
-                    uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u);
-
-                    uint combined = (lowSurrogateChar << 16) + highSurrogateChar;
-                    combined -= 0x40u; // wwww = uuuuu - 1
-                    combined += 0xDC00_D800u; // add surrogate markers
-                    return combined;
-                }
-                else
-                {
-                    // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
-                    // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
-                    // where wwww = uuuuu - 1
-                    uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
-                    retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
-                    retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
-                    retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
-                    retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
-                    retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
-                    retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
-                    retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
-                    retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
-                    return retVal;
-                }
+                // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
+                // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+                // where wwww = uuuuu - 1
+                uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
+                retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
+                retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
+                retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
+                retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
+                retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
+                retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
+                retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
+                retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+                return retVal;
              }
              else
              {
@@ -135,37 +114,19 @@ namespace System.Text.Unicode
                  // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
                  // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1
  
-                if (Bmi2.IsSupported)
-                {
-                    // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
-                    // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple
-                    // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across
-                    // all four output bytes.
-
-                    uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */;
-
-                    // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning
-                    // that should normally be masked out via an and, but we'll just direct pdep to ignore it.
+                value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
  
-                    uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ]
-                    return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
-                }
-                else
-                {
-                    value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
+                uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
+                tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
  
-                    uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
-                    tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
+                uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
+                uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
+                tempC |= tempB;
  
-                    uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
-                    uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
-                    tempC |= tempB;
+                uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
+                tempD |= 0x8080_80F0u;
  
-                    uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
-                    tempD |= 0x8080_80F0u;
-
-                    return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
-                }
+                return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
              }
              else
              {
@@ -757,43 +718,6 @@ namespace System.Text.Unicode
          }
  
          /// <summary>
-        /// Given a DWORD which represents a buffer of 4 ASCII bytes, widen each byte to a 16-bit WORD
-        /// and writes the resulting QWORD into the destination with machine endianness.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
-        {
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
-            }
-            else
-            {
-                if (BitConverter.IsLittleEndian)
-                {
-                    outputBuffer = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 3) = (char)value;
-                }
-                else
-                {
-                    Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    outputBuffer = (char)value;
-                }
-            }
-        }
-
-        /// <summary>
          /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianess,
          /// converts those scalar values to their 3-byte UTF-8 representation and writes the
          /// resulting 6 bytes to the destination buffer.
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs

index 126974c..f050248 100644 (file)
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs
@@ -7,6 +7,7 @@ using System.Buffers.Binary;
  using System.Diagnostics;
  using System.Numerics;
  using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
  using System.Runtime.Intrinsics.X86;
  using Internal.Runtime.CompilerServices;
  
@@ -78,7 +79,8 @@ namespace System.Text.Unicode
              byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
  #endif
  
-            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+            do
              {
                  // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
  
@@ -101,7 +103,7 @@ namespace System.Text.Unicode
                          goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
                      }
  
-                    Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+                    ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
                      pInputBuffer += 4;
                      pOutputBuffer += 4;
                      outputCharsRemaining -= 4;
@@ -127,8 +129,8 @@ namespace System.Text.Unicode
  
                          pInputBuffer += 8;
  
-                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
-                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord);
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord);
  
                          pOutputBuffer += 8;
                      }
@@ -143,7 +145,7 @@ namespace System.Text.Unicode
                      {
                          // The first DWORD contained all-ASCII bytes, so expand it.
  
-                        Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
  
                          // continue the outer loop from the second DWORD
  
@@ -487,12 +489,10 @@ namespace System.Text.Unicode
                      }
  
                      // As an optimization, on compatible platforms check if a second three-byte sequence immediately
-                    // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
+                    // follows the one we just read, and if so extract them together.
  
-                    if (Bmi2.X64.IsSupported)
+                    if (BitConverter.IsLittleEndian)
                      {
-                        Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
-
                          // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
                          // would indicate the potential start of a second three-byte sequence.
  
@@ -504,7 +504,7 @@ namespace System.Text.Unicode
  
                              if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
                              {
-                                // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
+                                // We're going to attempt to read a second 3-byte sequence and write them both out one after the other.
                                  // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
                                  // byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
                                  // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
@@ -517,14 +517,8 @@ namespace System.Text.Unicode
                                      && ((secondDWord & 0x0000_200Fu) != 0)
                                      && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
                                  {
-                                    // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
-                                    ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
-                                    thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic
-
-                                    // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
-                                    ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);
-
-                                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
+                                    pOutputBuffer[0] = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
+                                    pOutputBuffer[1] = (char)ExtractCharFromFirstThreeByteSequence(secondDWord);
                                      pInputBuffer += 6;
                                      pOutputBuffer += 2;
                                      outputCharsRemaining -= 2;
@@ -658,7 +652,7 @@ namespace System.Text.Unicode
  
                      continue; // go back to beginning of loop for processing
                  }
-            }
+            } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
  
          ProcessRemainingBytesSlow:
              inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
@@ -900,6 +894,16 @@ namespace System.Text.Unicode
  
              char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
  
+            // We have paths for SSE4.1 vectorization inside the inner loop. Since the below
+            // vector is only used in those code paths, we leave it zero-initialized (default)
+            // if SSE4.1 is not enabled.
+
+            Vector128<short> nonAsciiUtf16DataMask = default;
+            if (Sse41.X64.IsSupported)
+            {
+                nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char
+            }
+
              // Begin the main loop.
  
  #if DEBUG
@@ -908,7 +912,8 @@ namespace System.Text.Unicode
  
              uint thisDWord;
  
-            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+            do
              {
                  // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
  
@@ -952,27 +957,26 @@ namespace System.Text.Unicode
                      uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
                      uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
  
-                    if (Bmi2.X64.IsSupported)
+                    if (Sse41.X64.IsSupported)
                      {
-                        Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
-                        const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
+                        Debug.Assert(BitConverter.IsLittleEndian, "SSE41 requires little-endian.");
  
                          // Try reading and writing 8 elements per iteration.
                          uint maxIters = minElementsRemaining / 8;
-                        ulong firstQWord, secondQWord;
+                        ulong possibleNonAsciiQWord;
                          int i;
+                        Vector128<short> utf16Data;
                          for (i = 0; (uint)i < maxIters; i++)
                          {
-                            firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
-
-                            if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
+                            utf16Data = Unsafe.ReadUnaligned<Vector128<short>>(pInputBuffer);
+                            if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask))
                              {
-                                goto LoopTerminatedDueToNonAsciiData;
+                                goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
                              }
  
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+                            // narrow and write
+
+                            Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64());
  
                              pInputBuffer += 8;
                              pOutputBuffer += 8;
@@ -984,14 +988,14 @@ namespace System.Text.Unicode
  
                          if ((minElementsRemaining & 4) != 0)
                          {
-                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-
-                            if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
+                            possibleNonAsciiQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+                            if (!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord))
                              {
-                                goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
+                                goto LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal;
                              }
  
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+                            utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16();
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
  
                              pInputBuffer += 4;
                              pOutputBuffer += 4;
@@ -1000,29 +1004,31 @@ namespace System.Text.Unicode
  
                          continue; // Go back to beginning of main loop, read data, check for ASCII
  
-                    LoopTerminatedDueToNonAsciiData:
+                    LoopTerminatedDueToNonAsciiDataInVectorLocal:
  
                          outputBytesRemaining -= 8 * i;
+                        possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64());
  
-                        // First, see if we can drain any ASCII data from the first QWORD.
+                        // Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector,
+                        // then check whether it's all-ASCII. If so, narrow and write to the destination
+                        // buffer. Since we know that either the high 64 bits or the low 64 bits of the
+                        // vector contains non-ASCII data, by the end of the following block the
+                        // 'possibleNonAsciiQWord' local is guaranteed to contain the non-ASCII segment.
  
-                        if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
+                        if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII
                          {
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
                              pInputBuffer += 4;
                              pOutputBuffer += 4;
                              outputBytesRemaining -= 4;
-                        }
-                        else
-                        {
-                            secondQWord = firstQWord;
+                            possibleNonAsciiQWord = utf16Data.AsUInt64().GetElement(1);
                          }
  
-                    LoopTerminatedDueToNonAsciiDataInSecondQWord:
+                    LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal:
  
-                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
+                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)); // this condition should've been checked earlier
  
-                        thisDWord = (uint)secondQWord;
+                        thisDWord = (uint)possibleNonAsciiQWord;
                          if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
                          {
                              // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
@@ -1030,14 +1036,14 @@ namespace System.Text.Unicode
                              pInputBuffer += 2;
                              pOutputBuffer += 2;
                              outputBytesRemaining -= 2;
-                            thisDWord = (uint)(secondQWord >> 32);
+                            thisDWord = (uint)(possibleNonAsciiQWord >> 32);
                          }
  
                          goto AfterReadDWordSkipAllCharsAsciiCheck;
                      }
                      else
                      {
-                        // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
+                        // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration.
                          uint maxIters = minElementsRemaining / 4;
                          uint secondDWord;
                          int i;
@@ -1358,7 +1364,7 @@ namespace System.Text.Unicode
                  }
  
                  goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
-            }
+            } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
  
          ProcessNextCharAndFinish:
              inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;
author	Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
	Tue, 18 Feb 2020 18:06:38 +0000 (10:06 -0800)
committer	Hyungju Lee <leee.lee@samsung.com>
	Fri, 30 Oct 2020 09:20:49 +0000 (18:20 +0900)
src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs		patch \| blob \| history
src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs		patch \| blob \| history
src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs		patch \| blob \| history