Port dotnet/runtime#31904 to release/3.1 (#28013)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Tue, 18 Feb 2020 18:06:38 +0000 (10:06 -0800)
committer Hyungju Lee <leee.lee@samsung.com>
Fri, 30 Oct 2020 09:20:49 +0000 (18:20 +0900)
Remove BMI2 from ASCII and UTF-16 processing hot paths, as not all processors have optimized implementations of pext/pdep

src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs
src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs

index b37b017..bf2bccf 100644 (file)
@@ -1009,10 +1009,14 @@ namespace System.Text
         {
             Debug.Assert(AllCharsInUInt64AreAscii(value));
 
-            if (Bmi2.X64.IsSupported)
+            if (Sse2.X64.IsSupported)
             {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
+                // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes
+                // [ b0 b1 b2 b3 b0 b1 b2 b3 ], then writes 4 bytes (32 bits) to the destination.
+
+                Vector128<short> vecWide = Sse2.X64.ConvertScalarToVector128UInt64(value).AsInt16();
+                Vector128<uint> vecNarrow = Sse2.PackUnsignedSaturate(vecWide, vecWide).AsUInt32();
+                Unsafe.WriteUnaligned<uint>(ref outputBuffer, Sse2.ConvertToUInt32(vecNarrow));
             }
             else
             {
@@ -1694,14 +1698,16 @@ namespace System.Text
         /// writes them to the output buffer with machine endianness.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
+        internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
         {
             Debug.Assert(AllBytesInUInt32AreAscii(value));
 
-            if (Bmi2.X64.IsSupported)
+            if (Sse2.X64.IsSupported)
             {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
+                Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian.");
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
+                Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide));
             }
             else
             {
index 54940f9..9885a30 100644 (file)
@@ -6,7 +6,6 @@ using System.Buffers.Binary;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
-using System.Runtime.Intrinsics.X86;
 using Internal.Runtime.CompilerServices;
 
 namespace System.Text.Unicode
@@ -61,7 +60,7 @@ namespace System.Text.Unicode
         }
 
         /// <summary>
-        /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the input as a
+        /// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a
         /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -69,39 +68,19 @@ namespace System.Text.Unicode
         {
             if (BitConverter.IsLittleEndian)
             {
-                if (Bmi2.IsSupported)
-                {
-                    // need to reverse endianness for bit manipulation to work correctly
-                    value = BinaryPrimitives.ReverseEndianness(value);
-
-                    // value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
-                    // want to return [ 110110wwwwxxxxxx 110111xxxxxxxxxx ]
-                    // where wwww = uuuuu - 1
-
-                    uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u);
-                    uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u);
-
-                    uint combined = (lowSurrogateChar << 16) + highSurrogateChar;
-                    combined -= 0x40u; // wwww = uuuuu - 1
-                    combined += 0xDC00_D800u; // add surrogate markers
-                    return combined;
-                }
-                else
-                {
-                    // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
-                    // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
-                    // where wwww = uuuuu - 1
-                    uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
-                    retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
-                    retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
-                    retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
-                    retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
-                    retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
-                    retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
-                    retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
-                    retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
-                    return retVal;
-                }
+                // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
+                // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+                // where wwww = uuuuu - 1
+                uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
+                retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
+                retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
+                retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
+                retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
+                retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
+                retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
+                retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
+                retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
+                return retVal;
             }
             else
             {
@@ -135,37 +114,19 @@ namespace System.Text.Unicode
                 // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
                 // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1
 
-                if (Bmi2.IsSupported)
-                {
-                    // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
-                    // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple
-                    // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across
-                    // all four output bytes.
-
-                    uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */;
-
-                    // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning
-                    // that should normally be masked out via an and, but we'll just direct pdep to ignore it.
+                value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
 
-                    uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ]
-                    return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
-                }
-                else
-                {
-                    value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
+                uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
+                tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
 
-                    uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
-                    tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
+                uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
+                uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
+                tempC |= tempB;
 
-                    uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
-                    uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
-                    tempC |= tempB;
+                uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
+                tempD |= 0x8080_80F0u;
 
-                    uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
-                    tempD |= 0x8080_80F0u;
-
-                    return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
-                }
+                return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
             }
             else
             {
@@ -757,43 +718,6 @@ namespace System.Text.Unicode
         }
 
         /// <summary>
-        /// Given a DWORD which represents a buffer of 4 ASCII bytes, widen each byte to a 16-bit WORD
-        /// and writes the resulting QWORD into the destination with machine endianness.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
-        {
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
-            }
-            else
-            {
-                if (BitConverter.IsLittleEndian)
-                {
-                    outputBuffer = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 3) = (char)value;
-                }
-                else
-                {
-                    Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    outputBuffer = (char)value;
-                }
-            }
-        }
-
-        /// <summary>
         /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
         /// converts those scalar values to their 3-byte UTF-8 representation and writes the
         /// resulting 6 bytes to the destination buffer.
index 126974c..f050248 100644 (file)
@@ -7,6 +7,7 @@ using System.Buffers.Binary;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 using Internal.Runtime.CompilerServices;
 
@@ -78,7 +79,8 @@ namespace System.Text.Unicode
             byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
 #endif
 
-            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+            do
             {
                 // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
 
@@ -101,7 +103,7 @@ namespace System.Text.Unicode
                         goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
                     }
 
-                    Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+                    ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
                     pInputBuffer += 4;
                     pOutputBuffer += 4;
                     outputCharsRemaining -= 4;
@@ -127,8 +129,8 @@ namespace System.Text.Unicode
 
                         pInputBuffer += 8;
 
-                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
-                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord);
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord);
 
                         pOutputBuffer += 8;
                     }
@@ -143,7 +145,7 @@ namespace System.Text.Unicode
                     {
                         // The first DWORD contained all-ASCII bytes, so expand it.
 
-                        Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
 
                         // continue the outer loop from the second DWORD
 
@@ -487,12 +489,10 @@ namespace System.Text.Unicode
                     }
 
                     // As an optimization, on compatible platforms check if a second three-byte sequence immediately
-                    // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
+                    // follows the one we just read, and if so extract them together.
 
-                    if (Bmi2.X64.IsSupported)
+                    if (BitConverter.IsLittleEndian)
                     {
-                        Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
-
                         // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
                         // would indicate the potential start of a second three-byte sequence.
 
@@ -504,7 +504,7 @@ namespace System.Text.Unicode
 
                             if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
                             {
-                                // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
+                                // We're going to attempt to read a second 3-byte sequence and write them both out one after the other.
                                 // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
                                 // byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
                                 // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
@@ -517,14 +517,8 @@ namespace System.Text.Unicode
                                     && ((secondDWord & 0x0000_200Fu) != 0)
                                     && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
                                 {
-                                    // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
-                                    ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
-                                    thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic
-
-                                    // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
-                                    ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);
-
-                                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
+                                    pOutputBuffer[0] = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
+                                    pOutputBuffer[1] = (char)ExtractCharFromFirstThreeByteSequence(secondDWord);
                                     pInputBuffer += 6;
                                     pOutputBuffer += 2;
                                     outputCharsRemaining -= 2;
@@ -658,7 +652,7 @@ namespace System.Text.Unicode
 
                     continue; // go back to beginning of loop for processing
                 }
-            }
+            } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
 
         ProcessRemainingBytesSlow:
             inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
@@ -900,6 +894,16 @@ namespace System.Text.Unicode
 
             char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
 
+            // We have paths for SSE4.1 vectorization inside the inner loop. Since the below
+            // vector is only used in those code paths, we leave it at its default (all-zero)
+            // value if SSE4.1 (x64) is not supported.
+
+            Vector128<short> nonAsciiUtf16DataMask = default;
+            if (Sse41.X64.IsSupported)
+            {
+                nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char
+            }
+
             // Begin the main loop.
 
 #if DEBUG
@@ -908,7 +912,8 @@ namespace System.Text.Unicode
 
             uint thisDWord;
 
-            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+            Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+            do
             {
                 // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
 
@@ -952,27 +957,26 @@ namespace System.Text.Unicode
                     uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
                     uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
 
-                    if (Bmi2.X64.IsSupported)
+                    if (Sse41.X64.IsSupported)
                     {
-                        Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
-                        const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
+                        Debug.Assert(BitConverter.IsLittleEndian, "SSE41 requires little-endian.");
 
                         // Try reading and writing 8 elements per iteration.
                         uint maxIters = minElementsRemaining / 8;
-                        ulong firstQWord, secondQWord;
+                        ulong possibleNonAsciiQWord;
                         int i;
+                        Vector128<short> utf16Data;
                         for (i = 0; (uint)i < maxIters; i++)
                         {
-                            firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
-
-                            if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
+                            utf16Data = Unsafe.ReadUnaligned<Vector128<short>>(pInputBuffer);
+                            if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask))
                             {
-                                goto LoopTerminatedDueToNonAsciiData;
+                                goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
                             }
 
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+                            // narrow and write
+
+                            Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64());
 
                             pInputBuffer += 8;
                             pOutputBuffer += 8;
@@ -984,14 +988,14 @@ namespace System.Text.Unicode
 
                         if ((minElementsRemaining & 4) != 0)
                         {
-                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-
-                            if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
+                            possibleNonAsciiQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+                            if (!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord))
                             {
-                                goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
+                                goto LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal;
                             }
 
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
+                            utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16();
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
 
                             pInputBuffer += 4;
                             pOutputBuffer += 4;
@@ -1000,29 +1004,31 @@ namespace System.Text.Unicode
 
                         continue; // Go back to beginning of main loop, read data, check for ASCII
 
-                    LoopTerminatedDueToNonAsciiData:
+                    LoopTerminatedDueToNonAsciiDataInVectorLocal:
 
                         outputBytesRemaining -= 8 * i;
+                        possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64());
 
-                        // First, see if we can drain any ASCII data from the first QWORD.
+                        // Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector,
+                        // then check whether it's all-ASCII. If so, narrow and write to the destination
+                        // buffer. Since we know that either the high 64 bits or the low 64 bits of the
+                        // vector contains non-ASCII data, by the end of the following block the
+                        // 'possibleNonAsciiQWord' local is guaranteed to contain the non-ASCII segment.
 
-                        if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
+                        if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII
                         {
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
+                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
                             pInputBuffer += 4;
                             pOutputBuffer += 4;
                             outputBytesRemaining -= 4;
-                        }
-                        else
-                        {
-                            secondQWord = firstQWord;
+                            possibleNonAsciiQWord = utf16Data.AsUInt64().GetElement(1);
                         }
 
-                    LoopTerminatedDueToNonAsciiDataInSecondQWord:
+                    LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal:
 
-                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
+                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)); // this condition should've been checked earlier
 
-                        thisDWord = (uint)secondQWord;
+                        thisDWord = (uint)possibleNonAsciiQWord;
                         if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
                         {
                             // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
@@ -1030,14 +1036,14 @@ namespace System.Text.Unicode
                             pInputBuffer += 2;
                             pOutputBuffer += 2;
                             outputBytesRemaining -= 2;
-                            thisDWord = (uint)(secondQWord >> 32);
+                            thisDWord = (uint)(possibleNonAsciiQWord >> 32);
                         }
 
                         goto AfterReadDWordSkipAllCharsAsciiCheck;
                     }
                     else
                     {
-                        // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
+                        // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration.
                         uint maxIters = minElementsRemaining / 4;
                         uint secondDWord;
                         int i;
@@ -1358,7 +1364,7 @@ namespace System.Text.Unicode
                 }
 
                 goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
-            }
+            } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
 
         ProcessNextCharAndFinish:
             inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;