From 84eaa7ac079e625f2fbe36ba976f735dbdacdc6b Mon Sep 17 00:00:00 2001
From: Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Date: Thu, 14 Feb 2019 11:00:02 -0800
Subject: [PATCH] Add Rune creation API from UTF-16 surrogate pair (#22590)

Also brings in some perf improvements to existing char and UnicodeUtility APIs
---
 src/System.Private.CoreLib/shared/System/Char.cs   | 48 +++++++++++++++++++---
 .../shared/System/Text/Rune.cs                     | 42 +++++++++++++++++++
 .../shared/System/Text/UnicodeUtility.cs           | 16 +++++---
 3 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/src/System.Private.CoreLib/shared/System/Char.cs b/src/System.Private.CoreLib/shared/System/Char.cs
index 1312380..50dd092 100644
--- a/src/System.Private.CoreLib/shared/System/Char.cs
+++ b/src/System.Private.CoreLib/shared/System/Char.cs
@@ -904,7 +904,14 @@ namespace System
 
         public static bool IsSurrogatePair(char highSurrogate, char lowSurrogate)
         {
-            return IsHighSurrogate(highSurrogate) && IsLowSurrogate(lowSurrogate);
+            // Since both the high and low surrogate ranges are exactly 0x400 elements
+            // wide, and since this is a power of two, we can perform a single comparison
+            // by baselining each value to the start of its respective range and taking
+            // the bitwise OR of them.
+
+            uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
+            uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
+            return (highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE;
         }
 
         internal const int UNICODE_PLANE00_END = 0x00ffff;
@@ -937,15 +944,44 @@ namespace System
 
         public static int ConvertToUtf32(char highSurrogate, char lowSurrogate)
         {
-            if (!IsHighSurrogate(highSurrogate))
+            // First, extend both to 32 bits, then calculate the offset of
+            // each candidate surrogate char from the start of its range.
+
+            uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
+            uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
+
+            // This is a single comparison which allows us to check both for validity at once since
+            // both the high surrogate range and the low surrogate range are the same length.
+            // If the comparison fails, we call to a helper method to throw the correct exception message.
+
+            if ((highSurrogateOffset | lowSurrogateOffset) > CharUnicodeInfo.HIGH_SURROGATE_RANGE)
+            {
+                ConvertToUtf32_ThrowInvalidArgs(highSurrogateOffset);
+            }
+
+            // The 0x40 << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
+            return ((int)highSurrogateOffset << 10) + (lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + (0x40 << 10);
+        }
+
+        [StackTraceHidden]
+        private static void ConvertToUtf32_ThrowInvalidArgs(uint highSurrogateOffset)
+        {
+            // If the high surrogate is not within its expected range, throw an exception
+            // whose message fingers it as invalid. If it's within the expected range,
+            // change the message to read that the low surrogate was the problem.
+
+            if (highSurrogateOffset > CharUnicodeInfo.HIGH_SURROGATE_RANGE)
             {
-                throw new ArgumentOutOfRangeException(nameof(highSurrogate), SR.ArgumentOutOfRange_InvalidHighSurrogate);
+                throw new ArgumentOutOfRangeException(
+                    paramName: "highSurrogate",
+                    message: SR.ArgumentOutOfRange_InvalidHighSurrogate);
             }
-            if (!IsLowSurrogate(lowSurrogate))
+            else
             {
-                throw new ArgumentOutOfRangeException(nameof(lowSurrogate), SR.ArgumentOutOfRange_InvalidLowSurrogate);
+                throw new ArgumentOutOfRangeException(
+                    paramName: "lowSurrogate",
+                    message: SR.ArgumentOutOfRange_InvalidLowSurrogate);
             }
-            return (((highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START) * 0x400) + (lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + UNICODE_PLANE01_START);
         }
 
         /*=============================ConvertToUtf32===================================
diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
index 74aecbe..35733dc 100644
--- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs
@@ -59,6 +59,18 @@ namespace System.Text
         }
 
         /// <summary>
+        /// Creates a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// If <paramref name="highSurrogate"/> does not represent a UTF-16 high surrogate code point
+        /// or <paramref name="lowSurrogate"/> does not represent a UTF-16 low surrogate code point.
+        /// </exception>
+        public Rune(char highSurrogate, char lowSurrogate)
+            : this((uint)char.ConvertToUtf32(highSurrogate, lowSurrogate), false)
+        {
+        }
+
+        /// <summary>
         /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
         /// </summary>
         /// <exception cref="ArgumentOutOfRangeException">
@@ -365,6 +377,36 @@ namespace System.Text
         }
 
         /// <summary>
+        /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
+        /// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16 surrogate pair.
+        /// </summary>
+        public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result)
+        {
+            // First, extend both to 32 bits, then calculate the offset of
+            // each candidate surrogate char from the start of its range.
+
+            uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
+            uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
+
+            // This is a single comparison which allows us to check both for validity at once since
+            // both the high surrogate range and the low surrogate range are the same length.
+            // If the comparison fails, we report failure through the return value instead of throwing.
+
+            if ((highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE)
+            {
+                // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
+                result = UnsafeCreate((highSurrogateOffset << 10) + ((uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + (0x40u << 10));
+                return true;
+            }
+            else
+            {
+                // Didn't have a high surrogate followed by a low surrogate.
+                result = default;
+                return false;
+            }
+        }
+
+        /// <summary>
         /// Attempts to create a <see cref="Rune"/> from the provided input value.
         /// </summary>
         public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result);
diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
index e607acd..3aad296 100644
--- a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs
@@ -169,12 +169,18 @@ namespace System.Text
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static bool IsValidUnicodeScalar(uint value)
         {
-            // By XORing the incoming value with 0xD800, surrogate code points
-            // are moved to the range [ U+0000..U+07FF ], and all valid scalar
-            // values are clustered into the single range [ U+0800..U+10FFFF ],
-            // which allows performing a single fast range check.
+            // This is an optimized check that on x86 is just three instructions: lea, xor, cmp.
+            //
+            // After the subtraction operation, the input value is modified as such:
+            // [ 00000000..0010FFFF ] -> [ FFEF0000..FFFFFFFF ]
+            //
+            // We now want to _exclude_ the range [ FFEFD800..FFEFDFFF ] (surrogates) from being valid.
+            // After the xor, this particular exclusion range becomes [ FFEF0000..FFEF07FF ].
+            //
+            // So now the range [ FFEF0800..FFFFFFFF ] contains all valid code points,
+            // excluding surrogates. This allows us to perform a single comparison.
 
-            return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU);
+            return ((value - 0x110000u) ^ 0xD800u) >= 0xFFEF0800u;
         }
     }
 }
-- 
2.7.4