Re-flow Utf8Utility.Helpers

author Levi Broderick <levib@microsoft.com>

Wed, 10 Apr 2019 23:52:48 +0000 (16:52 -0700)

committer Levi Broderick <levib@microsoft.com>

Wed, 10 Apr 2019 23:52:48 +0000 (16:52 -0700)
author Levi Broderick <levib@microsoft.com>
Wed, 10 Apr 2019 23:52:48 +0000 (16:52 -0700)
committer Levi Broderick <levib@microsoft.com>
Wed, 10 Apr 2019 23:52:48 +0000 (16:52 -0700)
diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs

index b119922..c17c2cd 100644 (file)
--- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs
@@ -14,279 +14,6 @@ namespace System.Text.Unicode
      internal static partial class Utf8Utility
      {
          /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the first two bytes of the buffer are
-        /// an overlong representation of a sequence that should be represented as one byte.
-        /// This method *does not* validate that the sequence matches the appropriate
-        /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
-        {
-            // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
-            Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));
-
-            // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
-            // Since we already validated it's 80 <= ?? <= DF (per mask check earlier), now only need
-            // to check that it's < C2.
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u))
-                || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the first four bytes of the buffer match
-        /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This
-        /// method *does not* validate that the sequence is well-formed; the caller must
-        /// still perform overlong form or out-of-range checking.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
-        {
-            // The code in this method is equivalent to the code
-            // below but is slightly more optimized.
-            //
-            // if (BitConverter.IsLittleEndian)
-            // {
-            //     const uint mask = 0xC0C0C0F8U;
-            //     const uint comparand = 0x808080F0U;
-            //     return ((value & mask) == comparand);
-            // }
-            // else
-            // {
-            //     const uint mask = 0xF8C0C0C0U;
-            //     const uint comparand = 0xF0808000U;
-            //     return ((value & mask) == comparand);
-            // }
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
-                || (!BitConverter.IsLittleEndian && (((value - 0xF080_8000u) & 0xF8C0_C0C0u) == 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the first three bytes of the buffer match
-        /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not*
-        /// validate that the sequence is well-formed; the caller must still perform
-        /// overlong form or surrogate checking.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
-        {
-            // The code in this method is equivalent to the code
-            // below but is slightly more optimized.
-            //
-            // if (BitConverter.IsLittleEndian)
-            // {
-            //     const uint mask = 0x00C0C0F0U;
-            //     const uint comparand = 0x008080E0U;
-            //     return ((value & mask) == comparand);
-            // }
-            // else
-            // {
-            //     const uint mask = 0xF0C0C000U;
-            //     const uint comparand = 0xE0808000U;
-            //     return ((value & mask) == comparand);
-            // }
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0))
-                || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the first two bytes of the buffer match
-        /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
-        /// validate that the sequence is well-formed; the caller must still perform
-        /// overlong form checking.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
-        {
-            // The code in this method is equivalent to the code
-            // below but is slightly more optimized.
-            //
-            // if (BitConverter.IsLittleEndian)
-            // {
-            //     const uint mask = 0x0000C0E0U;
-            //     const uint comparand = 0x000080C0U;
-            //     return ((value & mask) == comparand);
-            // }
-            // else
-            // {
-            //     const uint mask = 0xE0C00000U;
-            //     const uint comparand = 0xC0800000U;
-            //     return ((value & mask) == comparand);
-            // }
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0))
-                || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the first two bytes of the buffer are
-        /// an overlong representation of a sequence that should be represented as one byte.
-        /// This method *does not* validate that the sequence matches the appropriate
-        /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
-        {
-            // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
-            Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));
-
-            // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
-            // We already validated that it's 80 .. DF (per mask check earlier).
-            // C2 = 1100 0010
-            // DF = 1101 1111
-            // This means that we can AND the leading byte with the mask 0001 1110 (1E),
-            // and if the result is zero the sequence is overlong.
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0))
-                || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the last two bytes of the buffer match
-        /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
-        /// validate that the sequence is well-formed; the caller must still perform
-        /// overlong form checking.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
-        {
-            // The code in this method is equivalent to the code
-            // below but is slightly more optimized.
-            //
-            // if (BitConverter.IsLittleEndian)
-            // {
-            //     const uint mask = 0xC0E00000U;
-            //     const uint comparand = 0x80C00000U;
-            //     return ((value & mask) == comparand);
-            // }
-            // else
-            // {
-            //     const uint mask = 0x0000E0C0U;
-            //     const uint comparand = 0x0000C080U;
-            //     return ((value & mask) == comparand);
-            // }
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0))
-                || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
-        /// returns <see langword="true"/> iff the first two bytes of the buffer are a well-formed
-        /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
-        /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
-        {
-            // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
-            // In little-endian, that would be represented as:
-            // [ ######## ######## 10xxxxxx 110yyyyy ].
-            // Due to the little-endian representation we can perform a trick by ANDing the low
-            // WORD with the bitmask [ 11000000 11111111 ] and checking that the value is within
-            // the range [ 10000000_11000010, 10000000_11011111 ]. This performs both the
-            // 2-byte-sequence bitmask check and overlong form validation with one comparison.
-
-            Debug.Assert(BitConverter.IsLittleEndian);
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu))
-                || (!BitConverter.IsLittleEndian && false);
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
-        /// returns <see langword="true"/> iff the last two bytes of the buffer are a well-formed
-        /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
-        /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
-        {
-            // See comments in UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian.
-
-            Debug.Assert(BitConverter.IsLittleEndian);
-
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FF_0000u, 0x80C2_0000u, 0x80DF_0000u))
-                || (!BitConverter.IsLittleEndian && false);
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the first byte of the buffer is ASCII.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32FirstByteIsAscii(uint value)
-        {
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && ((value & 0x80u) == 0))
-                || (!BitConverter.IsLittleEndian && ((int)value >= 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32FourthByteIsAscii(uint value)
-        {
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && ((int)value >= 0))
-                || (!BitConverter.IsLittleEndian && ((value & 0x80u) == 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the second byte of the buffer is ASCII.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32SecondByteIsAscii(uint value)
-        {
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && ((value & 0x8000u) == 0))
-                || (!BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0));
-        }
-
-        /// <summary>
-        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
-        /// returns <see langword="true"/> iff the third byte of the buffer is ASCII.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool UInt32ThirdByteIsAscii(uint value)
-        {
-            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
-            return (BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0))
-                || (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0));
-        }
-
-        /// <summary>
          /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the
          /// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation.
          /// </summary>
@@ -567,191 +294,464 @@ namespace System.Text.Unicode
          /// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated).
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
+        private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ].
+            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ].
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0)
+                || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsFirstCharSurrogate(uint value)
+        {
+            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ].
+            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ].
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0)
+                || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsFirstCharTwoUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ].
+            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ].
+
+            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian
+            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u)
+                || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu));
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
+        /// is a UTF-8 continuation byte.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsLowByteUtf8ContinuationByte(uint value)
+        {
+            // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
+            // so the best we can do for now is the lea / cmp pair.
+            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+            return (byte)(value - 0x80u) <= 0x3Fu;
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character is ASCII.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharAscii(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ].
+            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ].
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && value < 0x0080_0000u)
+                || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
+        /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
+            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0800..FFFF ].
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0)
+                || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharSurrogate(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ].
+            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ].
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u)
+                || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsSecondCharTwoUtf8Bytes(uint value)
+        {
+            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ].
+            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ].
+
+            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian
+            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu))
+                || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u);
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
+        /// i.e., has binary representation 10xxxxxx, where x is any bit.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsUtf8ContinuationByte(in byte value)
+        {
+            // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
+            // directly rather than bounce a temporary through a register. That is, we want the JIT to be
+            // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
+            // to see if it's a continuation byte. Data that's already enregistered will go through the
+            // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
+            //
+            // The below check takes advantage of the two's complement representation of negative numbers.
+            // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ]
+
+            return ((sbyte)value < -64);
+        }
+
+        /// <summary>
+        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+        /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool IsWellFormedUtf16SurrogatePair(uint value)
+        {
+            // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ].
+            // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
+            //
+            // We're essentially performing a range check on each component of the input in parallel. The allowed range
+            // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We
+            // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400
+            // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can*
+            // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc".
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0)
+                || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0);
+        }
+
+        /// <summary>
+        /// Converts a DWORD from machine-endian to little-endian.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint ToLittleEndian(uint value)
+        {
+            if (BitConverter.IsLittleEndian)
+            {
+                return value;
+            }
+            else
+            {
+                return BinaryPrimitives.ReverseEndianness(value);
+            }
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first two bytes of the buffer are
+        /// an overlong representation of a sequence that should be represented as one byte.
+        /// This method *does not* validate that the sequence matches the appropriate
+        /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
+        {
+            // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
+            Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));
+
+            // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
+            // Since we already validated it's C0 <= ?? <= DF (per mask check earlier), now only need
+            // to check that it's < C2.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u))
+                || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u));
+        }
+
+        /// <summary>
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first four bytes of the buffer match
+        /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This
+        /// method *does not* validate that the sequence is well-formed; the caller must
+        /// still perform overlong form or out-of-range checking.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
          {
-            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ].
-            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ].
+            // The code in this method is equivalent to the code
+            // below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0xC0C0C0F8U;
+            //     const uint comparand = 0x808080F0U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0xF8C0C0C0U;
+            //     const uint comparand = 0xF0808000U;
+            //     return ((value & mask) == comparand);
+            // }
  
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0)
-                || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u);
+            return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0xF080_8000u) & 0xF8C0_C0C0u) == 0));
          }
  
          /// <summary>
-        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
-        /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first three bytes of the buffer match
+        /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not*
+        /// validate that the sequence is well-formed; the caller must still perform
+        /// overlong form or surrogate checking.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsFirstCharSurrogate(uint value)
+        private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
          {
-            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ].
-            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ].
+            // The code in this method is equivalent to the code
+            // below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0x00C0C0F0U;
+            //     const uint comparand = 0x008080E0U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0xF0C0C000U;
+            //     const uint comparand = 0xE0808000U;
+            //     return ((value & mask) == comparand);
+            // }
  
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0)
-                || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u);
+            return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0));
          }
  
          /// <summary>
-        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
-        /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first two bytes of the buffer match
+        /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
+        /// validate that the sequence is well-formed; the caller must still perform
+        /// overlong form checking.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsFirstCharTwoUtf8Bytes(uint value)
+        private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
          {
-            // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ].
-            // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ].
-
-            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian
-            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
-            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+            // The code in this method is equivalent to the code
+            // below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0x0000C0E0U;
+            //     const uint comparand = 0x000080C0U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0xE0C00000U;
+            //     const uint comparand = 0xC0800000U;
+            //     return ((value & mask) == comparand);
+            // }
  
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u)
-                || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu));
+            return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0));
          }
  
          /// <summary>
-        /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
-        /// is a UTF-8 continuation byte.
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the last two bytes of the buffer are
+        /// an overlong representation of a sequence that should be represented as one byte.
+        /// This method *does not* validate that the sequence matches the appropriate
+        /// 2-byte sequence mask (see <see cref="UInt32EndsWithUtf8TwoByteMask"/>).
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsLowByteUtf8ContinuationByte(uint value)
+        private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
          {
-            // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
-            // so the best we can do for now is the lea / cmp pair.
-            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+            // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
+            Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));
  
-            return (byte)(value - 0x80u) <= 0x3Fu;
+            // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
+            // We already validated that it's C0 .. DF (per the '110yyyyy' mask check earlier).
+            // C2 = 1100 0010
+            // DF = 1101 1111
+            // This means that we can AND the leading byte with the mask 0001 1110 (1E),
+            // and if the result is zero the sequence is overlong.
+
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0));
          }
  
          /// <summary>
-        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
-        /// returns true iff the second UTF-16 character is ASCII.
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the last two bytes of the buffer match
+        /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
+        /// validate that the sequence is well-formed; the caller must still perform
+        /// overlong form checking.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsSecondCharAscii(uint value)
+        private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
          {
-            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ].
-            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ].
+            // The code in this method is equivalent to the code
+            // below but is slightly more optimized.
+            //
+            // if (BitConverter.IsLittleEndian)
+            // {
+            //     const uint mask = 0xC0E00000U;
+            //     const uint comparand = 0x80C00000U;
+            //     return ((value & mask) == comparand);
+            // }
+            // else
+            // {
+            //     const uint mask = 0x0000E0C0U;
+            //     const uint comparand = 0x0000C080U;
+            //     return ((value & mask) == comparand);
+            // }
  
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && value < 0x0080_0000u)
-                || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0);
+            return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0))
+                || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0));
          }
  
          /// <summary>
-        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
-        /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
-        /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
+        /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+        /// returns <see langword="true"/> iff the first two bytes of the buffer are a well-formed
+        /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
+        /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
+        private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
          {
-            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
-            // Big-endian: Given [ #### BBBB ], return whether ABBBBAAA is in range [ 0800..FFFF ].
+            // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
+            // In little-endian, that would be represented as:
+            // [ ######## ######## 10xxxxxx 110yyyyy ].
+            // Due to the little-endian representation we can perform a trick by ANDing the low
+            // WORD with the bitmask [ 11000000 11111111 ] and checking that the value is within
+            // the range [ 10000000_11000010, 10000000_11011111 ]. This performs both the
+            // 2-byte-sequence bitmask check and overlong form validation with one comparison.
+
+            Debug.Assert(BitConverter.IsLittleEndian);
  
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0)
-                || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0);
+            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu))
+                || (!BitConverter.IsLittleEndian && false);
          }
  
          /// <summary>
-        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
-        /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
+        /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+        /// returns <see langword="true"/> iff the last two bytes of the buffer are a well-formed
+        /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
+        /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsSecondCharSurrogate(uint value)
+        private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
          {
-            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ].
-            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ].
+            // See comments in UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian; the same check is applied to the high WORD here.
+
+            Debug.Assert(BitConverter.IsLittleEndian);
  
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u)
-                || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0);
+            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FF_0000u, 0x80C2_0000u, 0x80DF_0000u))
+                || (!BitConverter.IsLittleEndian && false);
          }
  
          /// <summary>
-        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
-        /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the first byte of the buffer is ASCII.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsSecondCharTwoUtf8Bytes(uint value)
+        private static bool UInt32FirstByteIsAscii(uint value)
          {
-            // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ].
-            // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ].
-
-            // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian
-            // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
-            // Tracked as https://github.com/dotnet/coreclr/issues/18022.
-
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu))
-                || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u);
+            return (BitConverter.IsLittleEndian && ((value & 0x80u) == 0))
+                || (!BitConverter.IsLittleEndian && ((int)value >= 0));
          }
  
          /// <summary>
-        /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
-        /// i.e., has binary representation 10xxxxxx, where x is any bit.
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsUtf8ContinuationByte(in byte value)
+        private static bool UInt32FourthByteIsAscii(uint value)
          {
-            // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
-            // directly rather than bounce a temporary through a register. That is, we want the JIT to be
-            // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
-            // to see if it's a continuation byte. Data that's already enregistered will go through the
-            // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
-            //
-            // The below check takes advantage of the two's complement representation of negative numbers.
-            // [ 0b1000_0000, 0b1011_1111 ] is [ -127 (sbyte.MinValue), -65 ]
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return ((sbyte)value < -64);
+            return (BitConverter.IsLittleEndian && ((int)value >= 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x80u) == 0));
          }
  
          /// <summary>
-        /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
-        /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair.
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the second byte of the buffer is ASCII.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsWellFormedUtf16SurrogatePair(uint value)
+        private static bool UInt32SecondByteIsAscii(uint value)
          {
-            // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ].
-            // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
-            //
-            // We're essentially performing a range check on each component of the input in parallel. The allowed range
-            // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We
-            // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400
-            // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can*
-            // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc".
-
              // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
  
-            return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0)
-                || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0);
+            return (BitConverter.IsLittleEndian && ((value & 0x8000u) == 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0));
          }
  
          /// <summary>
-        /// Converts a DWORD from machine-endian to little-endian.
+        /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+        /// returns <see langword="true"/> iff the third byte of the buffer is ASCII.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static uint ToLittleEndian(uint value)
+        private static bool UInt32ThirdByteIsAscii(uint value)
          {
-            if (BitConverter.IsLittleEndian)
-            {
-                return value;
-            }
-            else
-            {
-                return BinaryPrimitives.ReverseEndianness(value);
-            }
+            // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+            return (BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0))
+                || (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0));
          }
  
          /// <summary>
author	Levi Broderick <levib@microsoft.com>
	Wed, 10 Apr 2019 23:52:48 +0000 (16:52 -0700)
committer	Levi Broderick <levib@microsoft.com>
	Wed, 10 Apr 2019 23:52:48 +0000 (16:52 -0700)