internal static partial class Utf8Utility
{
/// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the first two bytes of the buffer are
- /// an overlong representation of a sequence that should be represented as one byte.
- /// This method *does not* validate that the sequence matches the appropriate
- /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
- {
- // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
- Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));
-
- // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
- // Since we already validated it's 80 <= ?? <= DF (per mask check earlier), now only need
- // to check that it's < C2.
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u))
- || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the first four bytes of the buffer match
- /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This
- /// method *does not* validate that the sequence is well-formed; the caller must
- /// still perform overlong form or out-of-range checking.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
- {
- // The code in this method is equivalent to the code
- // below but is slightly more optimized.
- //
- // if (BitConverter.IsLittleEndian)
- // {
- // const uint mask = 0xC0C0C0F8U;
- // const uint comparand = 0x808080F0U;
- // return ((value & mask) == comparand);
- // }
- // else
- // {
- // const uint mask = 0xF8C0C0C0U;
- // const uint comparand = 0xF0808000U;
- // return ((value & mask) == comparand);
- // }
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
- || (!BitConverter.IsLittleEndian && (((value - 0xF080_8000u) & 0xF8C0_C0C0u) == 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the first three bytes of the buffer match
- /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not*
- /// validate that the sequence is well-formed; the caller must still perform
- /// overlong form or surrogate checking.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
- {
- // The code in this method is equivalent to the code
- // below but is slightly more optimized.
- //
- // if (BitConverter.IsLittleEndian)
- // {
- // const uint mask = 0x00C0C0F0U;
- // const uint comparand = 0x008080E0U;
- // return ((value & mask) == comparand);
- // }
- // else
- // {
- // const uint mask = 0xF0C0C000U;
- // const uint comparand = 0xE0808000U;
- // return ((value & mask) == comparand);
- // }
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0))
- || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the first two bytes of the buffer match
- /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
- /// validate that the sequence is well-formed; the caller must still perform
- /// overlong form checking.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
- {
- // The code in this method is equivalent to the code
- // below but is slightly more optimized.
- //
- // if (BitConverter.IsLittleEndian)
- // {
- // const uint mask = 0x0000C0E0U;
- // const uint comparand = 0x000080C0U;
- // return ((value & mask) == comparand);
- // }
- // else
- // {
- // const uint mask = 0xE0C00000U;
- // const uint comparand = 0xC0800000U;
- // return ((value & mask) == comparand);
- // }
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0))
- || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the first two bytes of the buffer are
- /// an overlong representation of a sequence that should be represented as one byte.
- /// This method *does not* validate that the sequence matches the appropriate
- /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
- {
- // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
- Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));
-
- // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
- // We already validated that it's 80 .. DF (per mask check earlier).
- // C2 = 1100 0010
- // DF = 1101 1111
- // This means that we can AND the leading byte with the mask 0001 1110 (1E),
- // and if the result is zero the sequence is overlong.
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0))
- || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the last two bytes of the buffer match
- /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
- /// validate that the sequence is well-formed; the caller must still perform
- /// overlong form checking.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
- {
- // The code in this method is equivalent to the code
- // below but is slightly more optimized.
- //
- // if (BitConverter.IsLittleEndian)
- // {
- // const uint mask = 0xC0E00000U;
- // const uint comparand = 0x80C00000U;
- // return ((value & mask) == comparand);
- // }
- // else
- // {
- // const uint mask = 0x0000E0C0U;
- // const uint comparand = 0x0000C080U;
- // return ((value & mask) == comparand);
- // }
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0))
- || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
- /// returns <see langword="true"/> iff the first two bytes of the buffer are a well-formed
- /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
- /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
- {
- // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
- // In little-endian, that would be represented as:
- // [ ######## ######## 10xxxxxx 110yyyyy ].
- // Due to the little-endian representation we can perform a trick by ANDing the low
- // WORD with the bitmask [ 11000000 11111111 ] and checking that the value is within
- // the range [ 10000000_11000010, 10000000_11011111 ]. This performs both the
- // 2-byte-sequence bitmask check and overlong form validation with one comparison.
-
- Debug.Assert(BitConverter.IsLittleEndian);
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu))
- || (!BitConverter.IsLittleEndian && false);
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
- /// returns <see langword="true"/> iff the last two bytes of the buffer are a well-formed
- /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
- /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
- {
- // See comments in UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian.
-
- Debug.Assert(BitConverter.IsLittleEndian);
-
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FF_0000u, 0x80C2_0000u, 0x80DF_0000u))
- || (!BitConverter.IsLittleEndian && false);
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the first byte of the buffer is ASCII.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32FirstByteIsAscii(uint value)
- {
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && ((value & 0x80u) == 0))
- || (!BitConverter.IsLittleEndian && ((int)value >= 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32FourthByteIsAscii(uint value)
- {
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && ((int)value >= 0))
- || (!BitConverter.IsLittleEndian && ((value & 0x80u) == 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the second byte of the buffer is ASCII.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32SecondByteIsAscii(uint value)
- {
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && ((value & 0x8000u) == 0))
- || (!BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0));
- }
-
- /// <summary>
- /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
- /// returns <see langword="true"/> iff the third byte of the buffer is ASCII.
- /// </summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool UInt32ThirdByteIsAscii(uint value)
- {
- // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
-
- return (BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0))
- || (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0));
- }
-
- /// <summary>
/// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the
/// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation.
/// </summary>
/// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
+ private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ] (i.e., any bit of mask F800 is set).
+ // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ] (AAAA is the high WORD, so the whole DWORD is compared).
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0)
+ || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsFirstCharSurrogate(uint value)
+ {
+ // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ].
+ // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ].
+ // Subtracting D800 (with unsigned wraparound) maps exactly that range onto [ 0000..07FF ].
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0)
+ || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8 (i.e., is in the range 0080..07FF).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsFirstCharTwoUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ] (the 0xFFFF mask discards the second character).
+ // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ].
+
+ // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian
+ // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+ // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u)
+ || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu));
+ }
+
+ /// <summary>
+ /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
+ /// is a UTF-8 continuation byte, i.e., is in the range [ 80..BF ]; all higher bytes of the input are ignored.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsLowByteUtf8ContinuationByte(uint value)
+ {
+ // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
+ // so the best we can do for now is the lea / cmp pair.
+ // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+ return (byte)(value - 0x80u) <= 0x3Fu;
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character is ASCII (i.e., is in the range 0000..007F).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharAscii(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ] (BBBB is the high WORD, so the whole DWORD is compared).
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ] (no bit of mask FF80 is set).
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && value < 0x0080_0000u)
+ || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
+ /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0800..FFFF ].
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0)
+ || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharSurrogate(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ].
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ].
+ // Subtracting D800 (with unsigned wraparound) maps exactly that range onto [ 0000..07FF ].
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u)
+ || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8 (i.e., is in the range 0080..07FF).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsSecondCharTwoUtf8Bytes(uint value)
+ {
+ // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ].
+ // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ] (the 0xFFFF mask discards the first character).
+
+ // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian
+ // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
+ // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu))
+ || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u);
+ }
+
+ /// <summary>
+ /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
+ /// i.e., has binary representation 10xxxxxx, where x is any bit.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsUtf8ContinuationByte(in byte value)
+ {
+ // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
+ // directly rather than bounce a temporary through a register. That is, we want the JIT to be
+ // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
+ // to see if it's a continuation byte. Data that's already enregistered will go through the
+ // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
+ //
+ // The below check takes advantage of the two's complement representation of negative numbers.
+ // [ 0b1000_0000, 0b1011_1111 ] is [ -128 (sbyte.MinValue), -65 ], so "less than -64" selects exactly that range.
+
+ return ((sbyte)value < -64);
+ }
+
+ /// <summary>
+ /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
+ /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair (a high surrogate followed by a low surrogate).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool IsWellFormedUtf16SurrogatePair(uint value)
+ {
+ // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ].
+ // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
+ //
+ // We're essentially performing a range check on each component of the input in parallel. The allowed range
+ // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We
+ // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400
+ // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can*
+ // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc".
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0)
+ || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0);
+ }
+
+ /// <summary>
+ /// Converts a DWORD from machine-endian to little-endian: the input is returned unchanged on a little-endian machine and byte-reversed otherwise.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static uint ToLittleEndian(uint value)
+ {
+ if (BitConverter.IsLittleEndian)
+ {
+ return value;
+ }
+ else
+ {
+ return BinaryPrimitives.ReverseEndianness(value);
+ }
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first two bytes of the buffer are
+ /// an overlong representation of a sequence that should be represented as one byte.
+ /// This method *does not* validate that the sequence matches the appropriate
+ /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
+ {
+ // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
+ Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));
+
+ // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
+ // Since we already validated it's C0 <= ?? <= DF (per the '110yyyyy' mask check earlier), now only need
+ // to check that it's < C2.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((byte)value < 0xC2u))
+ || (!BitConverter.IsLittleEndian && (value < 0xC200_0000u));
+ }
+
+ /// <summary>
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first four bytes of the buffer match
+ /// the UTF-8 4-byte sequence mask [ 11110www 10zzzzzz 10yyyyyy 10xxxxxx ]. This
+ /// method *does not* validate that the sequence is well-formed; the caller must
+ /// still perform overlong form or out-of-range checking.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
{
- // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ].
- // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ].
+ // The code in this method is equivalent to the code
+ // below but is slightly more optimized. All three trailing bytes, including the fourth, must be 10xxxxxx in both byte orders.
+ //
+ // if (BitConverter.IsLittleEndian)
+ // {
+ // const uint mask = 0xC0C0C0F8U;
+ // const uint comparand = 0x808080F0U;
+ // return ((value & mask) == comparand);
+ // }
+ // else
+ // {
+ // const uint mask = 0xF8C0C0C0U;
+ // const uint comparand = 0xF0808080U;
+ // return ((value & mask) == comparand);
+ // }
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && (value & 0xF800u) != 0)
- || (!BitConverter.IsLittleEndian && value >= 0x0800_0000u);
+ return (BitConverter.IsLittleEndian && (((value - 0x8080_80F0u) & 0xC0C0_C0F8u) == 0))
+ || (!BitConverter.IsLittleEndian && (((value - 0xF080_8080u) & 0xF8C0_C0C0u) == 0));
}
/// <summary>
- /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
- /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first three bytes of the buffer match
+ /// the UTF-8 3-byte sequence mask [ 1110zzzz 10yyyyyy 10xxxxxx ]. This method *does not*
+ /// validate that the sequence is well-formed; the caller must still perform
+ /// overlong form or surrogate checking. The fourth byte of the DWORD is ignored.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsFirstCharSurrogate(uint value)
+ private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
{
- // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ].
- // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ].
+ // The code in this method is equivalent to the code
+ // below but is slightly more optimized (the comparand is subtracted first, then the mask is tested against zero).
+ //
+ // if (BitConverter.IsLittleEndian)
+ // {
+ // const uint mask = 0x00C0C0F0U;
+ // const uint comparand = 0x008080E0U;
+ // return ((value & mask) == comparand);
+ // }
+ // else
+ // {
+ // const uint mask = 0xF0C0C000U;
+ // const uint comparand = 0xE0808000U;
+ // return ((value & mask) == comparand);
+ // }
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0)
- || (!BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u);
+ return (BitConverter.IsLittleEndian && (((value - 0x0080_80E0u) & 0x00C0_C0F0u) == 0))
+ || (!BitConverter.IsLittleEndian && (((value - 0xE080_8000u) & 0xF0C0_C000u) == 0));
}
/// <summary>
- /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
- /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first two bytes of the buffer match
+ /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
+ /// validate that the sequence is well-formed; the caller must still perform
+ /// overlong form checking. The third and fourth bytes of the DWORD are ignored.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsFirstCharTwoUtf8Bytes(uint value)
+ private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
{
- // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ].
- // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ].
-
- // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the little-endian
- // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
- // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+ // The code in this method is equivalent to the code
+ // below but is slightly more optimized (the comparand is subtracted first, then the mask is tested against zero).
+ //
+ // if (BitConverter.IsLittleEndian)
+ // {
+ // const uint mask = 0x0000C0E0U;
+ // const uint comparand = 0x000080C0U;
+ // return ((value & mask) == comparand);
+ // }
+ // else
+ // {
+ // const uint mask = 0xE0C00000U;
+ // const uint comparand = 0xC0800000U;
+ // return ((value & mask) == comparand);
+ // }
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u)
- || (!BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu));
+ return (BitConverter.IsLittleEndian && (((value - 0x0000_80C0u) & 0x0000_C0E0u) == 0))
+ || (!BitConverter.IsLittleEndian && (((value - 0xC080_0000u) & 0xE0C0_0000u) == 0));
}
/// <summary>
- /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
- /// is a UTF-8 continuation byte.
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the last two bytes of the buffer are
+ /// an overlong representation of a sequence that should be represented as one byte.
+ /// This method *does not* validate that the sequence matches the appropriate
+ /// 2-byte sequence mask (see <see cref="UInt32EndsWithUtf8TwoByteMask"/>).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsLowByteUtf8ContinuationByte(uint value)
+ private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
{
- // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
- // so the best we can do for now is the lea / cmp pair.
- // Tracked as https://github.com/dotnet/coreclr/issues/18022.
+ // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
+ Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));
- return (byte)(value - 0x80u) <= 0x3Fu;
+ // Per Table 3-7, first byte of two-byte sequence must be within range C2 .. DF.
+ // We already validated that it's C0 .. DF (per the '110yyyyy' mask check earlier).
+ // C2 = 1100 0010
+ // DF = 1101 1111
+ // This means that we can AND the leading byte with the mask 0001 1110 (1E),
+ // and if the result is zero the sequence is overlong.
+
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((value & 0x001E_0000u) == 0))
+ || (!BitConverter.IsLittleEndian && ((value & 0x1E00u) == 0));
}
/// <summary>
- /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
- /// returns true iff the second UTF-16 character is ASCII.
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the last two bytes of the buffer match
+ /// the UTF-8 2-byte sequence mask [ 110yyyyy 10xxxxxx ]. This method *does not*
+ /// validate that the sequence is well-formed; the caller must still perform
+ /// overlong form checking. The first and second bytes of the DWORD are ignored.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsSecondCharAscii(uint value)
+ private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
{
- // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ].
- // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ].
+ // The code in this method is equivalent to the code
+ // below but is slightly more optimized (the comparand is subtracted first, then the mask is tested against zero).
+ //
+ // if (BitConverter.IsLittleEndian)
+ // {
+ // const uint mask = 0xC0E00000U;
+ // const uint comparand = 0x80C00000U;
+ // return ((value & mask) == comparand);
+ // }
+ // else
+ // {
+ // const uint mask = 0x0000E0C0U;
+ // const uint comparand = 0x0000C080U;
+ // return ((value & mask) == comparand);
+ // }
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && value < 0x0080_0000u)
- || (!BitConverter.IsLittleEndian && (value & 0xFF80u) == 0);
+ return (BitConverter.IsLittleEndian && (((value - 0x80C0_0000u) & 0xC0E0_0000u) == 0))
+ || (!BitConverter.IsLittleEndian && (((value - 0x0000_C080u) & 0x0000_E0C0u) == 0));
}
/// <summary>
- /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
- /// returns true iff the second UTF-16 character requires *at least* 3 bytes to encode in UTF-8.
- /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
+ /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+ /// returns <see langword="true"/> iff the first two bytes of the buffer are a well-formed
+ /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
+ /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
+ private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
{
- // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
- // Big-endian: Given [ #### BBBB ], return whether ABBBBAAA is in range [ 0800..FFFF ].
+ // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
+ // In little-endian, that would be represented as:
+ // [ ######## ######## 10xxxxxx 110yyyyy ].
+ // Due to the little-endian representation we can perform a trick by ANDing the low
+ // WORD with the bitmask [ 11000000 11111111 ] and checking that the value is within
+ // the range [ 10000000_11000010, 10000000_11011111 ]. This performs both the
+ // 2-byte-sequence bitmask check and overlong form validation with one comparison.
+ // NOTE(review): this assert fires on big-endian, so the documented 'false' result presumably applies only where asserts are compiled out; confirm intent.
+ Debug.Assert(BitConverter.IsLittleEndian);
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && (value & 0xF800_0000u) != 0)
- || (!BitConverter.IsLittleEndian && (value & 0xF800u) != 0);
+ return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FFu, 0x80C2u, 0x80DFu))
+ || (!BitConverter.IsLittleEndian && false);
}
/// <summary>
- /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
- /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
+ /// Given a UTF-8 buffer which has been read into a DWORD on a little-endian machine,
+ /// returns <see langword="true"/> iff the last two bytes of the buffer are a well-formed
+ /// UTF-8 two-byte sequence. This wraps the mask check and the overlong check into a
+ /// single operation. On a big-endian machine this trips a debug assertion and, when assertions are compiled out, returns <see langword="false"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsSecondCharSurrogate(uint value)
+ private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
{
- // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ].
- // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ].
+ // Same trick as UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian, but applied to the high WORD (constants shifted left by 16 bits).
+
+ Debug.Assert(BitConverter.IsLittleEndian);
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && (value - 0xD800_0000u) < 0x0800_0000u)
- || (!BitConverter.IsLittleEndian && ((value - 0xD800u) & 0xF800u) == 0);
+ return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value & 0xC0FF_0000u, 0x80C2_0000u, 0x80DF_0000u))
+ || (!BitConverter.IsLittleEndian && false); // constant false: this helper only supports little-endian
}
/// <summary>
- /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
- /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the first byte of the buffer is ASCII (its high bit is clear).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsSecondCharTwoUtf8Bytes(uint value)
+ private static bool UInt32FirstByteIsAscii(uint value)
{
- // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ].
- // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ].
-
- // TODO: I'd like to be able to write "(ushort)(value - 0x0080u) < 0x0780u" for the big-endian
- // case, but the JIT doesn't currently emit 16-bit comparisons efficiently.
- // Tracked as https://github.com/dotnet/coreclr/issues/18022.
-
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && UnicodeUtility.IsInRangeInclusive(value, 0x0080_0000u, 0x07FF_FFFFu))
- || (!BitConverter.IsLittleEndian && ((value - 0x0080u) & 0xFFFFu) < 0x0780u);
+ return (BitConverter.IsLittleEndian && ((value & 0x80u) == 0)) // little-endian: first byte is the lowest byte, so test bit 7
+ || (!BitConverter.IsLittleEndian && ((int)value >= 0)); // big-endian: first byte is the highest byte, so its high bit is the sign bit
}
/// <summary>
- /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-8 continuation byte;
- /// i.e., has binary representation 10xxxxxx, where x is any bit.
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII (its high bit is clear).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsUtf8ContinuationByte(in byte value)
+ private static bool UInt32FourthByteIsAscii(uint value)
{
- // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements
- // directly rather than bounce a temporary through a register. That is, we want the JIT to be
- // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location
- // to see if it's a continuation byte. Data that's already enregistered will go through the
- // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions.
- //
- // The below check takes advantage of the two's complement representation of negative numbers.
- // [ 0b1000_0000, 0b1011_1111 ] is [ -127 (sbyte.MinValue), -65 ]
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return ((sbyte)value < -64);
+ return (BitConverter.IsLittleEndian && ((int)value >= 0)) // little-endian: fourth byte is the highest byte, so its high bit is the sign bit
+ || (!BitConverter.IsLittleEndian && ((value & 0x80u) == 0)); // big-endian: fourth byte is the lowest byte, so test bit 7
}
/// <summary>
- /// Given a 32-bit integer that represents two packed UTF-16 characters, all in machine-endian order,
- /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair.
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the second byte of the buffer is ASCII (its high bit is clear).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static bool IsWellFormedUtf16SurrogatePair(uint value)
+ private static bool UInt32SecondByteIsAscii(uint value)
{
- // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ].
- // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
- //
- // We're essentially performing a range check on each component of the input in parallel. The allowed range
- // ends up being "< 0x0400" after the beginning of the allowed range is subtracted from each element. We
- // can't perform the equivalent of two CMPs in parallel, but we can take advantage of the fact that 0x0400
- // is a whole power of 2, which means that a CMP is really just a glorified TEST operation. Two TESTs *can*
- // be performed in parallel. The logic below then becomes 3 operations: "add/lea; test; jcc".
-
// Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
- return (BitConverter.IsLittleEndian && ((value - 0xDC00_D800u) & 0xFC00_FC00u) == 0)
- || (!BitConverter.IsLittleEndian && ((value - 0xD800_DC00u) & 0xFC00_FC00u) == 0);
+ return (BitConverter.IsLittleEndian && ((value & 0x8000u) == 0)) // little-endian: second byte occupies bits 8..15, so test bit 15
+ || (!BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0)); // big-endian: second byte occupies bits 16..23, so test bit 23
}
/// <summary>
- /// Converts a DWORD from machine-endian to little-endian.
+ /// Given a UTF-8 buffer which has been read into a DWORD in machine endianness,
+ /// returns <see langword="true"/> iff the third byte of the buffer is ASCII (its high bit is clear).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static uint ToLittleEndian(uint value)
+ private static bool UInt32ThirdByteIsAscii(uint value)
{
- if (BitConverter.IsLittleEndian)
- {
- return value;
- }
- else
- {
- return BinaryPrimitives.ReverseEndianness(value);
- }
+ // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
+
+ return (BitConverter.IsLittleEndian && ((value & 0x0080_0000u) == 0)) // little-endian: third byte occupies bits 16..23, so test bit 23
+ || (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0)); // big-endian: third byte occupies bits 8..15, so test bit 15
}
/// <summary>