From bae40fa2c2d8f9acb9b8148b7d6a6b91ace55058 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Wed, 10 Apr 2019 16:52:48 -0700 Subject: [PATCH] Re-flow Utf8Utility.Helpers --- .../System/Text/Unicode/Utf8Utility.Helpers.cs | 746 ++++++++++----------- 1 file changed, 373 insertions(+), 373 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs index b119922..c17c2cd 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -14,279 +14,6 @@ namespace System.Text.Unicode internal static partial class Utf8Utility { /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the first two bytes of the buffer are - /// an overlong representation of a sequence that should be represented as one byte. - /// This method *does not* validate that the sequence matches the appropriate - /// 2-byte sequence mask (see ). - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value) - { - // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input. - Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value)); - - // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF. - // Since we already validated it's 80 <= ?? <= DF (per mask check earlier), now only need - // to check that it's < C2. - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
- - return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u)) - || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the first four bytes of the buffer match - /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This - /// method *does not* validate that the sequence is well-formed; the caller must - /// still perform overlong form or out-of-range checking. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32BeginsWithUtf8FourByteMask(uint value) - { - // The code in this method is equivalent to the code - // below but is slightly more optimized. - // - // if (BitConverter.IsLittleEndian) - // { - // const uint mask = 0xC0C0C0F8U; - // const uint comparand = 0x808080F0U; - // return ((value & mask) == comparand); - // } - // else - // { - // const uint mask = 0xF8C0C0C0U; - // const uint comparand = 0xF0808000U; - // return ((value & mask) == comparand); - // } - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0)) - || (!BitConverter.IsLittleEndian && (((value - 0xF080_8000u) & 0xF8C0_C0C0u) == 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the first three bytes of the buffer match - /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not* - /// validate that the sequence is well-formed; the caller must still perform - /// overlong form or surrogate checking. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value) - { - // The code in this method is equivalent to the code - // below but is slightly more optimized. 
- // - // if (BitConverter.IsLittleEndian) - // { - // const uint mask = 0x00C0C0F0U; - // const uint comparand = 0x008080E0U; - // return ((value & mask) == comparand); - // } - // else - // { - // const uint mask = 0xF0C0C000U; - // const uint comparand = 0xE0808000U; - // return ((value & mask) == comparand); - // } - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0)) - || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the first two bytes of the buffer match - /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not* - /// validate that the sequence is well-formed; the caller must still perform - /// overlong form checking. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32BeginsWithUtf8TwoByteMask(uint value) - { - // The code in this method is equivalent to the code - // below but is slightly more optimized. - // - // if (BitConverter.IsLittleEndian) - // { - // const uint mask = 0x0000C0E0U; - // const uint comparand = 0x000080C0U; - // return ((value & mask) == comparand); - // } - // else - // { - // const uint mask = 0xE0C00000U; - // const uint comparand = 0xC0800000U; - // return ((value & mask) == comparand); - // } - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
- - return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0)) - || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the first two bytes of the buffer are - /// an overlong representation of a sequence that should be represented as one byte. - /// This method *does not* validate that the sequence matches the appropriate - /// 2-byte sequence mask (see ). - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value) - { - // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input. - Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value)); - - // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF. - // We already validated that it's 80 .. DF (per mask check earlier). - // C2 = 1100 0010 - // DF = 1101 1111 - // This means that we can AND the leading byte with the mask 0001 1110 (1E), - // and if the result is zero the sequence is overlong. - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0)) - || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the last two bytes of the buffer match - /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not* - /// validate that the sequence is well-formed; the caller must still perform - /// overlong form checking. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32EndsWithUtf8TwoByteMask(uint value) - { - // The code in this method is equivalent to the code - // below but is slightly more optimized. 
- // - // if (BitConverter.IsLittleEndian) - // { - // const uint mask = 0xC0E00000U; - // const uint comparand = 0x80C00000U; - // return ((value & mask) == comparand); - // } - // else - // { - // const uint mask = 0x0000E0C0U; - // const uint comparand = 0x0000C080U; - // return ((value & mask) == comparand); - // } - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0)) - || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine, - /// returns iff the first two bytes of the buffer are a well-formed - /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a - /// single operation. Returns if running on a big-endian machine. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value) - { - // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ]. - // In little-endian, that would be represented as: - // [ ######## ######## 10xxxxxx 110yyyyy ]. - // Due to the little-endian representation we can perform a trick by ANDing the low - // WORD with the bitmask [ 11000000 11111111 ] and checking that the value is within - // the range [ 10000000_11000010, 10000000_11011111 ]. This performs both the - // 2-byte-sequence bitmask check and overlong form validation with one comparison. - - Debug.Assert(BitConverter.IsLittleEndian); - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
- - return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu)) - || (!BitConverter.IsLittleEndian && false); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine, - /// returns iff the last two bytes of the buffer are a well-formed - /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a - /// single operation. Returns if running on a big-endian machine. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value) - { - // See comments in UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian. - - Debug.Assert(BitConverter.IsLittleEndian); - - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FF_0000u, 0x80C2_0000u, 0x80DF_0000u)) - || (!BitConverter.IsLittleEndian && false); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the first byte of the buffer is ASCII. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32FirstByteIsAscii(uint value) - { - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && ((value & 0x80u) == 0)) - || (!BitConverter.IsLittleEndian && ((int)value >= 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the fourth byte of the buffer is ASCII. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32FourthByteIsAscii(uint value) - { - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
- - return (BitConverter.IsLittleEndian && ((int)value >= 0)) - || (!BitConverter.IsLittleEndian && ((value & 0x80u) == 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the second byte of the buffer is ASCII. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32SecondByteIsAscii(uint value) - { - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && ((value & 0x8000u) == 0)) - || (!BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0)); - } - - /// - /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, - /// returns iff the third byte of the buffer is ASCII. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool UInt32ThirdByteIsAscii(uint value) - { - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - - return (BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0)) - || (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0)); - } - - /// /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the /// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation. /// @@ -567,191 +294,464 @@ namespace System.Text.Unicode /// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated). /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value) + private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value) + { + // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ]. + // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0) + || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u); + } + + /// + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the first UTF-16 character is a surrogate character (either high or low). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsFirstCharSurrogate(uint value) + { + // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ]. + // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0) + || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u); + } + + /// + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsFirstCharTwoUtf8Bytes(uint value) + { + // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ]. + // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ]. + + // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian + // case, but the JIT doesn't currently emit 16-bit comparisons efficiently. + // Tracked as https://github.com/dotnet/coreclr/issues/18022. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u) + || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu)); + } + + /// + /// Returns iff the low byte of + /// is a UTF-8 continuation byte. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsLowByteUtf8ContinuationByte(uint value) + { + // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte), + // so the best we can do for now is the lea / cmp pair. + // Tracked as https://github.com/dotnet/coreclr/issues/18022. + + return (byte)(value - 0x80u) <= 0x3Fu; + } + + /// + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character is ASCII. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharAscii(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ]. + // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && value < 0x0080_0000u) + || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0); + } + + /// + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8. + /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ]. 
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0800..FFFF ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0) + || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0); + } + + /// + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character is a surrogate character (either high or low). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharSurrogate(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ]. + // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ]. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u) + || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0); + } + + /// + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSecondCharTwoUtf8Bytes(uint value) + { + // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ]. + // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ]. + + // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian + // case, but the JIT doesn't currently emit 16-bit comparisons efficiently. + // Tracked as https://github.com/dotnet/coreclr/issues/18022. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu)) + || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u); + } + + /// + /// Returns iff is a UTF-8 continuation byte; + /// i.e., has binary representation 10xxxxxx, where x is any bit. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsUtf8ContinuationByte(in byte value) + { + // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements + // directly rather than bounce a temporary through a register. That is, we want the JIT to be + // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location + // to see if it's a continuation byte. Data that's already enregistered will go through the + // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions. + // + // The below check takes advantage of the two's complement representation of negative numbers. + // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ] + + return ((sbyte)value < -64); + } + + /// + /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, + /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsWellFormedUtf16SurrogatePair(uint value) + { + // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ]. + // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ]. + // + // We're essentially performing a range check on each component of the input in parallel. The allowed range + // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. 
We + // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400 + // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can* + // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc". + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0) + || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0); + } + + /// + /// Converts a DWORD from machine-endian to little-endian. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint ToLittleEndian(uint value) + { + if (BitConverter.IsLittleEndian) + { + return value; + } + else + { + return BinaryPrimitives.ReverseEndianness(value); + } + } + + /// + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns iff the first two bytes of the buffer are + /// an overlong representation of a sequence that should be represented as one byte. + /// This method *does not* validate that the sequence matches the appropriate + /// 2-byte sequence mask (see ). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value) + { + // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input. + Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value)); + + // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF. + // Since we already validated it's 80 <= ?? <= DF (per mask check earlier), now only need + // to check that it's < C2. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
+ + return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u)) + || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u)); + } + + /// + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns iff the first four bytes of the buffer match + /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This + /// method *does not* validate that the sequence is well-formed; the caller must + /// still perform overlong form or out-of-range checking. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UInt32BeginsWithUtf8FourByteMask(uint value) { - // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ]. - // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ]. + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0xC0C0C0F8U; + // const uint comparand = 0x808080F0U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0xF8C0C0C0U; + // const uint comparand = 0xF0808000U; + // return ((value & mask) == comparand); + // } // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0) - || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u); + return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0xF080_8000u) & 0xF8C0_C0C0u) == 0)); } /// - /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, - /// returns true iff the first UTF-16 character is a surrogate character (either high or low). 
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns iff the first three bytes of the buffer match + /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not* + /// validate that the sequence is well-formed; the caller must still perform + /// overlong form or surrogate checking. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsFirstCharSurrogate(uint value) + private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value) { - // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ]. - // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ]. + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0x00C0C0F0U; + // const uint comparand = 0x008080E0U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0xF0C0C000U; + // const uint comparand = 0xE0808000U; + // return ((value & mask) == comparand); + // } // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0) - || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u); + return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0)); } /// - /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, - /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8. + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns iff the first two bytes of the buffer match + /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. 
This method *does not* + /// validate that the sequence is well-formed; the caller must still perform + /// overlong form checking. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsFirstCharTwoUtf8Bytes(uint value) + private static bool UInt32BeginsWithUtf8TwoByteMask(uint value) { - // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ]. - // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ]. - - // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian - // case, but the JIT doesn't currently emit 16-bit comparisons efficiently. - // Tracked as https://github.com/dotnet/coreclr/issues/18022. + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0x0000C0E0U; + // const uint comparand = 0x000080C0U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0xE0C00000U; + // const uint comparand = 0xC0800000U; + // return ((value & mask) == comparand); + // } // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u) - || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu)); + return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0)); } /// - /// Returns iff the low byte of - /// is a UTF-8 continuation byte. + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns iff the first two bytes of the buffer are + /// an overlong representation of a sequence that should be represented as one byte. 
+ /// This method *does not* validate that the sequence matches the appropriate + /// 2-byte sequence mask (see ). /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsLowByteUtf8ContinuationByte(uint value) + private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value) { - // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte), - // so the best we can do for now is the lea / cmp pair. - // Tracked as https://github.com/dotnet/coreclr/issues/18022. + // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input. + Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value)); - return (byte)(value - 0x80u) <= 0x3Fu; + // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF. + // We already validated that it's 80 .. DF (per mask check earlier). + // C2 = 1100 0010 + // DF = 1101 1111 + // This means that we can AND the leading byte with the mask 0001 1110 (1E), + // and if the result is zero the sequence is overlong. + + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0)) + || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0)); } /// - /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, - /// returns true iff the second UTF-16 character is ASCII. + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns iff the last two bytes of the buffer match + /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not* + /// validate that the sequence is well-formed; the caller must still perform + /// overlong form checking. 
/// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsSecondCharAscii(uint value) + private static bool UInt32EndsWithUtf8TwoByteMask(uint value) { - // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ]. - // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ]. + // The code in this method is equivalent to the code + // below but is slightly more optimized. + // + // if (BitConverter.IsLittleEndian) + // { + // const uint mask = 0xC0E00000U; + // const uint comparand = 0x80C00000U; + // return ((value & mask) == comparand); + // } + // else + // { + // const uint mask = 0x0000E0C0U; + // const uint comparand = 0x0000C080U; + // return ((value & mask) == comparand); + // } // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return (BitConverter.IsLittleEndian && value < 0x0080_0000u) - || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0); + return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0)) + || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0)); } /// - /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, - /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8. - /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated). + /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine, + /// returns iff the first two bytes of the buffer are a well-formed + /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a + /// single operation. Returns if running on a big-endian machine. 
/// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value) + private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value) { - // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ]. - // Big-endian: Given [ #### BBBB ], return whether ABBBBAAA is in range [ 0800..FFFF ]. + // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ]. + // In little-endian, that would be represented as: + // [ ######## ######## 10xxxxxx 110yyyyy ]. + // Due to the little-endian representation we can perform a trick by ANDing the low + // WORD with the bitmask [ 11000000 11111111 ] and checking that the value is within + // the range [ 10000000_11000010, 10000000_11011111 ]. This performs both the + // 2-byte-sequence bitmask check and overlong form validation with one comparison. + + Debug.Assert(BitConverter.IsLittleEndian); // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0) - || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0); + return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu)) + || (!BitConverter.IsLittleEndian && false); } /// - /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, - /// returns true iff the second UTF-16 character is a surrogate character (either high or low). + /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine, + /// returns iff the last two bytes of the buffer are a well-formed + /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a + /// single operation. Returns if running on a big-endian machine. 
/// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsSecondCharSurrogate(uint value) + private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value) { - // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ]. - // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ]. + // See comments in UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian. + + Debug.Assert(BitConverter.IsLittleEndian); // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u) - || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0); + return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FF_0000u, 0x80C2_0000u, 0x80DF_0000u)) + || (!BitConverter.IsLittleEndian && false); } /// - /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, - /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8. + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the first byte of the buffer is ASCII. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsSecondCharTwoUtf8Bytes(uint value) + private static bool UInt32FirstByteIsAscii(uint value) { - // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ]. - // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ]. - - // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian - // case, but the JIT doesn't currently emit 16-bit comparisons efficiently. - // Tracked as https://github.com/dotnet/coreclr/issues/18022. - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. 
- return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu)) - || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u); + return (BitConverter.IsLittleEndian && ((value & 0x80u) == 0)) + || (!BitConverter.IsLittleEndian && ((int)value >= 0)); } /// - /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte; - /// i.e., has binary representation 10xxxxxx, where x is any bit. + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsUtf8ContinuationByte(in byte value) + private static bool UInt32FourthByteIsAscii(uint value) { - // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements - // directly rather than bounce a temporary through a register. That is, we want the JIT to be - // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location - // to see if it's a continuation byte. Data that's already enregistered will go through the - // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions. - // - // The below check takes advantage of the two's complement representation of negative numbers. - // [ 0b1000_0000, 0b1011_1111 ] is [ -127 (sbyte.MinValue), -65 ] + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return ((sbyte)value < -64); + return (BitConverter.IsLittleEndian && ((int)value >= 0)) + || (!BitConverter.IsLittleEndian && ((value & 0x80u) == 0)); } /// - /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order, - /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair. 
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the second byte of the buffer is ASCII. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsWellFormedUtf16SurrogatePair(uint value) + private static bool UInt32SecondByteIsAscii(uint value) { - // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ]. - // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ]. - // - // We're essentially performing a range check on each component of the input in parallel. The allowed range - // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We - // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400 - // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can* - // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc". - // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. - return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0) - || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0); + return (BitConverter.IsLittleEndian && ((value & 0x8000u) == 0)) + || (!BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0)); } /// - /// Converts a DWORD from machine-endian to little-endian. + /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness, + /// returns <see langword="true"/> iff the third byte of the buffer is ASCII. 
/// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static uint ToLittleEndian(uint value) + private static bool UInt32ThirdByteIsAscii(uint value) { - if (BitConverter.IsLittleEndian) - { - return value; - } - else - { - return BinaryPrimitives.ReverseEndianness(value); - } + // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914. + + return (BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0)) + || (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0)); } /// -- 2.7.4