From affc24ab3e7ac6f06eecb1dd9e74c86e7c53104b Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Thu, 11 Apr 2019 13:39:55 -0700 Subject: [PATCH] PR feedback: Clarify 3-byte sequence processing --- .../System/Text/Unicode/Utf8Utility.Helpers.cs | 2 ++ .../System/Text/Unicode/Utf8Utility.Transcoding.cs | 25 ++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs index c17c2cd..ab29fbe 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -20,6 +20,8 @@ namespace System.Text.Unicode [MethodImpl(MethodImplOptions.AggressiveInlining)] private static uint ExtractCharFromFirstThreeByteSequence(uint value) { + Debug.Assert(UInt32BeginsWithUtf8ThreeByteMask(value)); + if (BitConverter.IsLittleEndian) { // value = [ ######## | 10xxxxxx 10yyyyyy 1110zzzz ] diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs index 9ee331c..fbfc2ce 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -493,18 +493,31 @@ namespace System.Text.Unicode if (Bmi2.X64.IsSupported) { Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian."); + + // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which + // would indicate the potential start of a second three-byte sequence. + if (((thisDWord - 0xE000_0000u) & 0xF000_0000u) == 0) { - if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 7) + // The const '3' below is correct because pFinalPosWhereCanReadDWordFromInputBuffer represents + // the final place where we can safely perform a DWORD read, and we want to probe whether it's + // safe to read a DWORD beginning at address &pInputBuffer[3]. + + if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3) { // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT. - - uint nextDWord = Unsafe.ReadUnaligned(pInputBuffer + 3); - if (((nextDWord & 0x0000_200Fu) != 0) && (((nextDWord - 0x0000_200Du) & 0x0000_200Fu) != 0)) + // Since we already validated the first byte of the second DWORD (it's the same as the final byte of the + // first DWORD), the only checks that remain are the overlong + surrogate checks. If the overlong or surrogate + // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid + // 3-byte UTF-8 sequence we read; and on the next iteration of the loop the validation routine will run again, + // fail, and redirect control flow to the error handling logic at the very end of this method. + + uint secondDWord = Unsafe.ReadUnaligned(pInputBuffer + 3); + if (((secondDWord & 0x0000_200Fu) != 0) && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0)) { // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD - ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(nextDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord); - thisDWord = nextDWord; // store this value in the correct local for the ASCII drain logic + ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord); + thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ] ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul); -- 2.7.4