From 27cd821920a69ea6456284654e8d2b5c24de6b15 Mon Sep 17 00:00:00 2001
From: Jan Kotas <jkotas@microsoft.com>
Date: Mon, 4 Jun 2018 13:09:29 -0700
Subject: [PATCH] Make Encoding.Unicode.GetString faster on 64-bit platforms
 (#18263)

UnicodeEncoding was taking fast path only when both source and destination were 8-byte aligned. It was not
the case for Unicode.GetString.

The fix is to just check the source alignment as fast path precondition, and use unaligned write for destination.

Also removed alternative fast path that is not needed anymore, tweaked local variable assignments for better
register allocation, and fixed performance issue introduced by recent big endian portability fix.
---
 .../shared/System/Text/UnicodeEncoding.cs          | 156 +++++++--------------
 1 file changed, 51 insertions(+), 105 deletions(-)

diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeEncoding.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeEncoding.cs
index b654838..8386c49 100644
--- a/src/System.Private.CoreLib/shared/System/Text/UnicodeEncoding.cs
+++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeEncoding.cs
@@ -6,11 +6,17 @@
 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
 //
 
+// This define can be used to turn off the fast loops. Useful for finding whether
+// the problem is fastloop-specific.
+#define FASTLOOP
+
 using System;
 using System.Globalization;
 using System.Diagnostics;
 using System.Runtime.InteropServices;
 
+using Internal.Runtime.CompilerServices;
+
 namespace System.Text
 {
     public class UnicodeEncoding : Encoding
@@ -23,17 +29,14 @@ namespace System.Text
         private static readonly byte[] s_bigEndianPreamble = new byte[2] { 0xfe, 0xff };
         private static readonly byte[] s_littleEndianPreamble = new byte[2] { 0xff, 0xfe };
 
-        internal bool isThrowException = false;
+        private bool isThrowException = false;
 
-        internal bool bigEndian = false;
-        internal bool byteOrderMark = true;
+        private bool bigEndian = false;
+        private bool byteOrderMark = true;
 
         // Unicode version 2.0 character size in bytes
         public const int CharSize = 2;
 
-        // endianness-based bit pattern mask.
-        static readonly ulong highLowPatternMask = ((ulong) 0xd800d800d800d800 | (BitConverter.IsLittleEndian ? (ulong) 0x0400000004000000 : (ulong) 0x0000040000000400)); 
-
         public UnicodeEncoding()
             : this(false, true)
         {
@@ -374,11 +377,6 @@ namespace System.Text
 
             bool wasHereBefore = false;
 
-            // Need -1 to check 2 at a time.  If we have an even #, longChars will go
-            // from longEnd - 1/2 long to longEnd + 1/2 long.  If we're odd, longChars
-            // will go from longEnd - 1 long to longEnd. (Might not get to use this)
-            ulong* longEnd = (ulong*)(charEnd - 3);
-
             // For fallback we may need a fallback buffer
             EncoderFallbackBuffer fallbackBuffer = null;
             char* charsForFallback;
@@ -412,16 +410,21 @@ namespace System.Text
                 if (ch == 0)
                 {
                     // No fallback, maybe we can do it fast
-#if !NO_FAST_UNICODE_LOOP
+#if FASTLOOP
                     // If endianess is backwards then each pair of bytes would be backwards.
-                    if ( (bigEndian ^ BitConverter.IsLittleEndian) && 
-
-#if BIT64           // 64 bit CPU needs to be long aligned for this to work.
-                          charLeftOver == 0 && (unchecked((long)chars) & 7) == 0)
+                    if ( (bigEndian ^ BitConverter.IsLittleEndian) &&
+#if BIT64
+                        (unchecked((long)chars) & 7) == 0 &&
 #else
-                          charLeftOver == 0 && (unchecked((int)chars) & 3) == 0)
+                        (unchecked((int)chars) & 3) == 0 &&
 #endif
+                        charLeftOver == 0)
                     {
+                        // Need -1 to check 2 at a time.  If we have an even #, longChars will go
+                        // from longEnd - 1/2 long to longEnd + 1/2 long.  If we're odd, longChars
+                        // will go from longEnd - 1 long to longEnd. (Might not get to use this)
+                        ulong* longEnd = (ulong*)(charEnd - 3);
+
                         // Need new char* so we can check 4 at a time
                         ulong* longChars = (ulong*)chars;
 
@@ -452,7 +455,8 @@ namespace System.Text
 
                                     // If they happen to be high/low/high/low, we may as well continue.  Check the next
                                     // bit to see if its set (low) or not (high) in the right pattern
-                                    if (((0xfc00fc00fc00fc00 & *longChars) ^ highLowPatternMask) != 0)
+                                    if ((0xfc00fc00fc00fc00 & *longChars) !=
+                                            (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                                     {
                                         // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                         // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
@@ -475,7 +479,7 @@ namespace System.Text
                         if (chars >= charEnd)
                             break;
                     }
-#endif // !NO_FAST_UNICODE_LOOP
+#endif // FASTLOOP
 
                     // No fallback, just get next char
                     ch = *chars;
@@ -695,14 +699,14 @@ namespace System.Text
                 if (ch == 0)
                 {
                     // No fallback, maybe we can do it fast
-#if !NO_FAST_UNICODE_LOOP
+#if FASTLOOP
                     // If endianess is backwards then each pair of bytes would be backwards.
                     if ( (bigEndian ^ BitConverter.IsLittleEndian) && 
-#if BIT64           // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned
-                        (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
+#if BIT64
+                        (unchecked((long)chars) & 7) == 0 &&
 #else
-                        (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
-#endif // BIT64
+                        (unchecked((int)chars) & 3) == 0 &&
+#endif
                         charLeftOver == 0)
                     {
                         // Need -1 to check 2 at a time.  If we have an even #, longChars will go
@@ -744,7 +748,8 @@ namespace System.Text
 
                                     // If they happen to be high/low/high/low, we may as well continue.  Check the next
                                     // bit to see if its set (low) or not (high) in the right pattern
-                                    if (((0xfc00fc00fc00fc00 & *longChars) ^ highLowPatternMask) != 0)
+                                    if ((0xfc00fc00fc00fc00 & *longChars) !=
+                                            (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                                     {
                                         // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                         // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
@@ -759,7 +764,7 @@ namespace System.Text
                             // else all < 0x8000 so we can use them
 
                             // We can use these 4 chars.
-                            *longBytes = *longChars;
+                            Unsafe.WriteUnaligned<ulong>(longBytes, *longChars);
                             longChars++;
                             longBytes++;
                         }
@@ -770,62 +775,7 @@ namespace System.Text
                         if (chars >= charEnd)
                             break;
                     }
-                    // Not aligned, but maybe we can still be somewhat faster
-                    // Also somehow this optimizes the above loop?  It seems to cause something above
-                    // to get enregistered, but I haven't figured out how to make that happen without this loop.
-                    else if ((charLeftOver == 0) &&
-                        (bigEndian ^ BitConverter.IsLittleEndian) && 
-
-#if BIT64
-                        (unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) &&  // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time
-#else
-                        (unchecked((int)chars) & 3) != (unchecked((int)bytes) & 3) &&  // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time
-#endif // BIT64
-                        (unchecked((int)(bytes)) & 1) == 0)
-                    {
-                        // # to use
-                        long iCount = ((byteEnd - bytes) >> 1 < charEnd - chars) ?
-                                       (byteEnd - bytes) >> 1 : charEnd - chars;
-
-                        // Need new char*
-                        char* charOut = ((char*)bytes);     // a char* for our output
-                        char* tempEnd = chars + iCount - 1; // Our end pointer
-
-                        while (chars < tempEnd)
-                        {
-                            if (*chars >= (char)0xd800 && *chars <= (char)0xdfff)
-                            {
-                                // break for fallback for low surrogate
-                                if (*chars >= 0xdc00)
-                                    break;
-
-                                // break if next one's not a low surrogate (will do fallback)
-                                if (*(chars + 1) < 0xdc00 || *(chars + 1) > 0xdfff)
-                                    break;
-
-                                // They both exist, use them
-                            }
-                            // If 2nd char is surrogate & this one isn't then only add one
-                            else if (*(chars + 1) >= (char)0xd800 && *(chars + 1) <= 0xdfff)
-                            {
-                                *charOut = *chars;
-                                charOut++;
-                                chars++;
-                                continue;
-                            }
-
-                            *charOut = *chars;
-                            *(charOut + 1) = *(chars + 1);
-                            charOut += 2;
-                            chars += 2;
-                        }
-
-                        bytes = (byte*)charOut;
-
-                        if (chars >= charEnd)
-                            break;
-                    }
-#endif // !NO_FAST_UNICODE_LOOP
+#endif // FASTLOOP
 
                     // No fallback, just get next char
                     ch = *chars;
@@ -1054,9 +1004,6 @@ namespace System.Text
                 encoder == null || !encoder._throwOnOverflow,
                 "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");
 
-            // We used to copy it fast, but this doesn't check for surrogates
-            // System.IO.__UnmanagedMemoryStream.memcpyimpl(bytes, (byte*)chars, usedByteCount);
-
             return (int)(bytes - byteStart);
         }
 
@@ -1077,11 +1024,6 @@ namespace System.Text
             // Start by assuming same # of chars as bytes
             int charCount = count >> 1;
 
-            // Need -1 to check 2 at a time.  If we have an even #, longBytes will go
-            // from longEnd - 1/2 long to longEnd + 1/2 long.  If we're odd, longBytes
-            // will go from longEnd - 1 long to longEnd. (Might not get to use this)
-            ulong* longEnd = (ulong*)(byteEnd - 7);
-
             // For fallback we may need a fallback buffer
             DecoderFallbackBuffer fallbackBuffer = null;
 
@@ -1110,15 +1052,20 @@ namespace System.Text
             {
                 // If we're aligned then maybe we can do it fast
                 // That'll hurt if we're unaligned because we'll always test but never be aligned
-#if !NO_FAST_UNICODE_LOOP
+#if FASTLOOP
                 if ((bigEndian ^ BitConverter.IsLittleEndian) &&
-#if BIT64 // win64 has to be long aligned
+#if BIT64
                     (unchecked((long)bytes) & 7) == 0 &&
 #else
                     (unchecked((int)bytes) & 3) == 0 &&
 #endif // BIT64
                     lastByte == -1 && lastChar == 0)
                 {
+                    // Need -1 to check 2 at a time.  If we have an even #, longBytes will go
+                    // from longEnd - 1/2 long to longEnd + 1/2 long.  If we're odd, longBytes
+                    // will go from longEnd - 1 long to longEnd. (Might not get to use this)
+                    ulong* longEnd = (ulong*)(byteEnd - 7);
+
                     // Need new char* so we can check 4 at a time
                     ulong* longBytes = (ulong*)bytes;
 
@@ -1149,7 +1096,8 @@ namespace System.Text
 
                                 // If they happen to be high/low/high/low, we may as well continue.  Check the next
                                 // bit to see if its set (low) or not (high) in the right pattern
-                                if (((0xfc00fc00fc00fc00 & *longBytes) ^ highLowPatternMask) != 0)
+                                if ((0xfc00fc00fc00fc00 & *longBytes) !=
+                                        (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                                 {
                                     // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                     // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
@@ -1172,7 +1120,7 @@ namespace System.Text
                     if (bytes >= byteEnd)
                         break;
                 }
-#endif // !NO_FAST_UNICODE_LOOP
+#endif // FASTLOOP
 
                 // Get 1st byte
                 if (lastByte < 0)
@@ -1425,13 +1373,13 @@ namespace System.Text
             {
                 // If we're aligned then maybe we can do it fast
                 // That'll hurt if we're unaligned because we'll always test but never be aligned
-#if !NO_FAST_UNICODE_LOOP
+#if FASTLOOP
                 if ((bigEndian ^ BitConverter.IsLittleEndian) &&
-#if BIT64 // win64 has to be long aligned
-                    (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
+#if BIT64
+                    (unchecked((long)chars) & 7) == 0 &&
 #else
-                    (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
-#endif // BIT64
+                    (unchecked((int)chars) & 3) == 0 &&
+#endif
                     lastByte == -1 && lastChar == 0)
                 {
                     // Need -1 to check 2 at a time.  If we have an even #, longChars will go
@@ -1473,7 +1421,8 @@ namespace System.Text
 
                                 // If they happen to be high/low/high/low, we may as well continue.  Check the next
                                 // bit to see if its set (low) or not (high) in the right pattern
-                                if (((0xfc00fc00fc00fc00 & *longBytes) ^ highLowPatternMask) != 0)
+                                if ((0xfc00fc00fc00fc00 & *longBytes) !=
+                                        (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                                 {
                                     // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                     // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
@@ -1488,7 +1437,7 @@ namespace System.Text
                         // else all < 0x8000 so we can use them
 
                         // We can use these 4 chars.
-                        *longChars = *longBytes;
+                        Unsafe.WriteUnaligned<ulong>(longChars, *longBytes);
                         longBytes++;
                         longChars++;
                     }
@@ -1499,7 +1448,7 @@ namespace System.Text
                     if (bytes >= byteEnd)
                         break;
                 }
-#endif // !NO_FAST_UNICODE_LOOP
+#endif // FASTLOOP
 
                 // Get 1st byte
                 if (lastByte < 0)
@@ -1808,9 +1757,6 @@ namespace System.Text
                 decoder.lastByte = lastByte;
             }
 
-            // Used to do this the old way
-            // System.IO.__UnmanagedMemoryStream.memcpyimpl((byte*)chars, bytes, byteCount);
-
             // Shouldn't have anything in fallback buffer for GetChars
             // (don't have to check _throwOnOverflow for count or chars)
             Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
-- 
2.7.4