From 84eaa7ac079e625f2fbe36ba976f735dbdacdc6b Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Thu, 14 Feb 2019 11:00:02 -0800 Subject: [PATCH] Add Rune creation API from UTF-16 surrogate pair (#22590) Also brings in some perf improvements to existing char and UnicodeUtility APIs --- src/System.Private.CoreLib/shared/System/Char.cs | 48 +++++++++++++++++++--- .../shared/System/Text/Rune.cs | 42 +++++++++++++++++++ .../shared/System/Text/UnicodeUtility.cs | 16 +++++--- 3 files changed, 95 insertions(+), 11 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Char.cs b/src/System.Private.CoreLib/shared/System/Char.cs index 1312380..50dd092 100644 --- a/src/System.Private.CoreLib/shared/System/Char.cs +++ b/src/System.Private.CoreLib/shared/System/Char.cs @@ -904,7 +904,14 @@ namespace System public static bool IsSurrogatePair(char highSurrogate, char lowSurrogate) { - return IsHighSurrogate(highSurrogate) && IsLowSurrogate(lowSurrogate); + // Since both the high and low surrogate ranges are exactly 0x400 elements + // wide, and since this is a power of two, we can perform a single comparison + // by baselining each value to the start of its respective range and taking + // the logical OR of them. + + uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START; + uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START; + return (highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE; } internal const int UNICODE_PLANE00_END = 0x00ffff; @@ -937,15 +944,44 @@ namespace System public static int ConvertToUtf32(char highSurrogate, char lowSurrogate) { - if (!IsHighSurrogate(highSurrogate)) + // First, extend both to 32 bits, then calculate the offset of + // each candidate surrogate char from the start of its range. 
+ + uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START; + uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START; + + // This is a single comparison which allows us to check both for validity at once since + // both the high surrogate range and the low surrogate range are the same length. + // If the comparison fails, we call a helper method that throws the exception with the correct message. + + if ((highSurrogateOffset | lowSurrogateOffset) > CharUnicodeInfo.HIGH_SURROGATE_RANGE) + { + ConvertToUtf32_ThrowInvalidArgs(highSurrogateOffset); + } + + // The 0x40 << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding. + return ((int)highSurrogateOffset << 10) + (lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + (0x40 << 10); + } + + [StackTraceHidden] + private static void ConvertToUtf32_ThrowInvalidArgs(uint highSurrogateOffset) + { + // If the high surrogate is not within its expected range, throw an exception + // whose message identifies it as invalid. If it's within the expected range, + // change the message to read that the low surrogate was the problem. 
+ + if (highSurrogateOffset > CharUnicodeInfo.HIGH_SURROGATE_RANGE) { - throw new ArgumentOutOfRangeException(nameof(highSurrogate), SR.ArgumentOutOfRange_InvalidHighSurrogate); + throw new ArgumentOutOfRangeException( + paramName: "highSurrogate", + message: SR.ArgumentOutOfRange_InvalidHighSurrogate); } - if (!IsLowSurrogate(lowSurrogate)) + else { - throw new ArgumentOutOfRangeException(nameof(lowSurrogate), SR.ArgumentOutOfRange_InvalidLowSurrogate); + throw new ArgumentOutOfRangeException( + paramName: "lowSurrogate", + message: SR.ArgumentOutOfRange_InvalidLowSurrogate); } - return (((highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START) * 0x400) + (lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + UNICODE_PLANE01_START); } /*=============================ConvertToUtf32=================================== diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs index 74aecbe..35733dc 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -59,6 +59,18 @@ namespace System.Text } /// <summary> + /// Creates a <see cref="Rune"/> from the provided UTF-16 surrogate pair. + /// </summary> + /// <exception cref="ArgumentOutOfRangeException"> + /// If <paramref name="highSurrogate"/> does not represent a UTF-16 high surrogate code point + /// or <paramref name="lowSurrogate"/> does not represent a UTF-16 low surrogate code point. + /// </exception> + public Rune(char highSurrogate, char lowSurrogate) + : this((uint)char.ConvertToUtf32(highSurrogate, lowSurrogate), false) + { + } + + /// <summary> /// Creates a <see cref="Rune"/> from the provided Unicode scalar value. /// </summary> /// @@ -365,6 +377,36 @@ namespace System.Text } /// <summary> + /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair. + /// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16 surrogate pair. + /// </summary> + public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result) + { + // First, extend both to 32 bits, then calculate the offset of + // each candidate surrogate char from the start of its range. 
+ + uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START; + uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START; + + // This is a single comparison which allows us to check both for validity at once since + // both the high surrogate range and the low surrogate range are the same length. + // If the comparison fails, the inputs are not a well-formed surrogate pair, so we report failure rather than throw. + + if ((highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE) + { + // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding. + result = UnsafeCreate((highSurrogateOffset << 10) + ((uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + (0x40u << 10)); + return true; + } + else + { + // Didn't have a high surrogate followed by a low surrogate. + result = default; + return false; + } + } + + /// <summary> /// Attempts to create a <see cref="Rune"/> from the provided input value. /// </summary> public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result); diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs index e607acd..3aad296 100644 --- a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs +++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs @@ -169,12 +169,18 @@ namespace System.Text [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsValidUnicodeScalar(uint value) { - // By XORing the incoming value with 0xD800, surrogate code points - // are moved to the range [ U+0000..U+07FF ], and all valid scalar - // values are clustered into the single range [ U+0800..U+10FFFF ], - // which allows performing a single fast range check. + // This is an optimized check that on x86 is just three instructions: lea, xor, cmp. 
+ // + // After the subtraction operation, the input value is modified as such: + // [ 00000000..0010FFFF ] -> [ FFEF0000..FFFFFFFF ] + // + // We now want to _exclude_ the range [ FFEFD800..FFEFDFFF ] (surrogates) from being valid. + // After the xor, this particular exclusion range becomes [ FFEF0000..FFEF07FF ]. + // + // So now the range [ FFEF0800..FFFFFFFF ] contains all valid code points, + // excluding surrogates. This allows us to perform a single comparison. - return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU); + return ((value - 0x110000u) ^ 0xD800u) >= 0xFFEF0800u; } } } -- 2.7.4