Improve performance of Marvin32 (dotnet/coreclr#22816)

author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>

Fri, 29 Mar 2019 05:25:16 +0000 (22:25 -0700)

committer GitHub <noreply@github.com>

Fri, 29 Mar 2019 05:25:16 +0000 (22:25 -0700)
author Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
Fri, 29 Mar 2019 05:25:16 +0000 (22:25 -0700)
committer GitHub <noreply@github.com>
Fri, 29 Mar 2019 05:25:16 +0000 (22:25 -0700)
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs

index 1a4357a..8551830 100644 (file)
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs
@@ -181,7 +181,7 @@ namespace System
              // TODO_UTF8STRING: Consider whether this should use a different seed than String.GetHashCode.
  
              ulong seed = Marvin.DefaultSeed;
-            return Marvin.ComputeHash32(ref DangerousGetMutableReference(), _length /* in bytes */, (uint)seed, (uint)(seed >> 32));
+            return Marvin.ComputeHash32(ref DangerousGetMutableReference(), (uint)_length /* in bytes */, (uint)seed, (uint)(seed >> 32));
          }
  
          /// <summary>
diff --git a/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs

index f55a9f5..beab0cf 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs
@@ -81,8 +81,8 @@ namespace System
              Debug.Assert(charsWritten == count); // invariant case conversion should involve simple folding; preserve code unit count
  
              // Slice the array to the size returned by ToUpperInvariant.
-            // Multiplication below may overflow, that's fine since it's going to an unsigned integer.
-            int hash = ComputeHash32(ref Unsafe.As<char, byte>(ref MemoryMarshal.GetReference(scratch)), charsWritten * 2, p0, p1);
+            // Multiplication below will not overflow since going from positive Int32 to UInt32.
+            int hash = ComputeHash32(ref Unsafe.As<char, byte>(ref MemoryMarshal.GetReference(scratch)), (uint)charsWritten * 2, p0, p1);
  
              // Return the borrowed array if necessary.
              if (borrowedArr != null)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Marvin.cs b/src/libraries/System.Private.CoreLib/src/System/Marvin.cs

index 6cf2040..832c84b 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/Marvin.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Marvin.cs
@@ -22,80 +22,205 @@ namespace System
          /// Compute a Marvin hash and collapse it into a 32-bit hash.
          /// </summary>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static int ComputeHash32(ReadOnlySpan<byte> data, ulong seed) => ComputeHash32(ref MemoryMarshal.GetReference(data), data.Length, (uint)seed, (uint)(seed >> 32));
+        public static int ComputeHash32(ReadOnlySpan<byte> data, ulong seed) => ComputeHash32(ref MemoryMarshal.GetReference(data), (uint)data.Length, (uint)seed, (uint)(seed >> 32));
  
          /// <summary>
          /// Compute a Marvin hash and collapse it into a 32-bit hash.
          /// </summary>
-        public static int ComputeHash32(ref byte data, int count, uint p0, uint p1)
+        public static int ComputeHash32(ref byte data, uint count, uint p0, uint p1)
          {
-            nuint ucount = (nuint)count;
-            nuint byteOffset = 0;
+            // Control flow of this method generally flows top-to-bottom, trying to
+            // minimize the number of branches taken for large (>= 8 bytes, 4 chars) inputs.
+            // If small inputs (< 8 bytes, 4 chars) are given, this jumps to a "small inputs"
+            // handler at the end of the method.
  
-            while (ucount >= 8)
+            if (count < 8)
              {
-                p0 += Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref data, byteOffset));
-                Block(ref p0, ref p1);
+                // We can't run the main loop, but we might still have 4 or more bytes available to us.
+                // If so, jump to the 4 .. 7 bytes logic immediately after the main loop.
+
+                if (count >= 4)
+                {
+                    goto Between4And7BytesRemain;
+                }
+                else
+                {
+                    goto InputTooSmallToEnterMainLoop;
+                }
+            }
+
+            // Main loop - read 8 bytes at a time.
+            // The block function is unrolled 2x in this loop.
+
+            uint loopCount = count / 8;
+            Debug.Assert(loopCount > 0, "Shouldn't reach this code path for small inputs.");
+
+            do
+            {
+                // Most x86 processors have two dispatch ports for reads, so we can read 2x 32-bit
+                // values in parallel. We opt for this instead of a single 64-bit read since the
+                // typical use case for Marvin32 is computing String hash codes, and the particular
+                // layout of String instances means the starting data is never 8-byte aligned when
+                // running in a 64-bit process.
  
-                p0 += Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref data, byteOffset + 4));
+                p0 += Unsafe.ReadUnaligned<uint>(ref data);
+                uint nextUInt32 = Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref data, 4));
+
+                // One block round for each of the 32-bit integers we just read, 2x rounds total.
+
+                Block(ref p0, ref p1);
+                p0 += nextUInt32;
                  Block(ref p0, ref p1);
  
-                byteOffset += 8;
-                ucount -= 8;
+                // Bump the data reference pointer and decrement the loop count.
+
+                // Decrementing by 1 every time and comparing against zero allows the JIT to produce
+                // better codegen compared to a standard 'for' loop with an incrementing counter.
+                // Requires https://github.com/dotnet/coreclr/issues/7566 to be addressed first
+                // before we can realize the full benefits of this.
+
+                data = ref Unsafe.AddByteOffset(ref data, 8);
+            } while (--loopCount > 0);
+
+            // n.b. We've not been updating the original 'count' parameter, so its actual value is
+            // still the original data length. However, we can still rely on its least significant
+            // 3 bits to tell us how much data remains (0 .. 7 bytes) after the loop above is
+            // completed.
+
+            if ((count & 0b_0100) == 0)
+            {
+                goto DoFinalPartialRead;
              }
  
-            switch (ucount)
+        Between4And7BytesRemain:
+
+            // If after finishing the main loop we still have 4 or more leftover bytes, or if we had
+            // 4 .. 7 bytes to begin with and couldn't enter the loop in the first place, we need to
+            // consume 4 bytes immediately and send them through one round of the block function.
+
+            Debug.Assert(count >= 4, "Only should've gotten here if the original count was >= 4.");
+
+            p0 += Unsafe.ReadUnaligned<uint>(ref data);
+            Block(ref p0, ref p1);
+
+        DoFinalPartialRead:
+
+            // Finally, we have 0 .. 3 bytes leftover. Since we know the original data length was at
+            // least 4 bytes (smaller lengths are handled at the end of this routine), we can safely
+            // read the 4 bytes at the end of the buffer without reading past the beginning of the
+            // original buffer. This necessarily means the data we're about to read will overlap with
+            // some data we've already processed, but we can handle that below.
+
+            Debug.Assert(count >= 4, "Only should've gotten here if the original count was >= 4.");
+
+            // Read the last 4 bytes of the buffer.
+
+            uint partialResult = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref Unsafe.AddByteOffset(ref data, (nuint)count & 7), -4));
+
+            // The 'partialResult' local above contains any data we have yet to read, plus some number
+            // of bytes which we've already read from the buffer. An example of this is given below
+            // for little-endian architectures. In this table, AA BB CC are the bytes which we still
+            // need to consume, and ## are bytes which we want to throw away since we've already
+            // consumed them as part of a previous read.
+            //
+            //                                                    (partialResult contains)   (we want it to contain)
+            // count mod 4 = 0 -> [ ## ## ## ## |             ] -> 0x####_####             -> 0x0000_0080
+            // count mod 4 = 1 -> [ ## ## ## ## | AA          ] -> 0xAA##_####             -> 0x0000_80AA
+            // count mod 4 = 2 -> [ ## ## ## ## | AA BB       ] -> 0xBBAA_####             -> 0x0080_BBAA
+            // count mod 4 = 3 -> [ ## ## ## ## | AA BB CC    ] -> 0xCCBB_AA##             -> 0x80CC_BBAA
+
+            count = ~count << 3;
+
+            if (BitConverter.IsLittleEndian)
              {
-                case 4:
-                    p0 += Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref data, byteOffset));
-                    Block(ref p0, ref p1);
-                    goto case 0;
-
-                case 0:
-                    p0 += 0x80u;
-                    break;
-
-                case 5:
-                    p0 += Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref data, byteOffset));
-                    byteOffset += 4;
-                    Block(ref p0, ref p1);
-                    goto case 1;
-
-                case 1:
-                    p0 += 0x8000u | Unsafe.AddByteOffset(ref data, byteOffset);
-                    break;
-
-                case 6:
-                    p0 += Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref data, byteOffset));
-                    byteOffset += 4;
-                    Block(ref p0, ref p1);
-                    goto case 2;
-
-                case 2:
-                    p0 += 0x800000u | Unsafe.ReadUnaligned<ushort>(ref Unsafe.AddByteOffset(ref data, byteOffset));
-                    break;
-
-                case 7:
-                    p0 += Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref data, byteOffset));
-                    byteOffset += 4;
-                    Block(ref p0, ref p1);
-                    goto case 3;
-
-                case 3:
-                    p0 += 0x80000000u | (((uint)(Unsafe.AddByteOffset(ref data, byteOffset + 2))) << 16) | (uint)(Unsafe.ReadUnaligned<ushort>(ref Unsafe.AddByteOffset(ref data, byteOffset)));
-                    break;
-
-                default:
-                    Debug.Fail("Should not get here.");
-                    break;
+                partialResult >>= 8; // make some room for the 0x80 byte
+                partialResult |= 0x8000_0000u; // put the 0x80 byte at the beginning
+                partialResult >>= (int)count & 0x1F; // shift out all previously consumed bytes
              }
+            else
+            {
+                partialResult <<= 8; // make some room for the 0x80 byte
+                partialResult |= 0x80u; // put the 0x80 byte at the end
+                partialResult <<= (int)count & 0x1F; // shift out all previously consumed bytes
+            }
+
+        DoFinalRoundsAndReturn:
  
+            // Now that we've computed the final partial result, merge it in and run two rounds of
+            // the block function to finish out the Marvin algorithm.
+
+            p0 += partialResult;
              Block(ref p0, ref p1);
              Block(ref p0, ref p1);
  
              return (int)(p1 ^ p0);
+
+        InputTooSmallToEnterMainLoop:
+
+            // We had only 0 .. 3 bytes to begin with, so we can't perform any 32-bit reads.
+            // This means that we're going to be building up the final result right away and
+            // will only ever run two rounds total of the block function. Let's initialize
+            // the partial result to "no data".
+
+            if (BitConverter.IsLittleEndian)
+            {
+                partialResult = 0x80u;
+            }
+            else
+            {
+                partialResult = 0x80000000u;
+            }
+
+            if ((count & 0b_0001) != 0)
+            {
+                // If the buffer is 1 or 3 bytes in length, let's read a single byte now
+                // and merge it into our partial result. This will result in partialResult
+                // having one of the two values below, where AA BB CC are the buffer bytes.
+                // 
+                //                  (little-endian / big-endian)
+                // [ AA          ]  -> 0x0000_80AA / 0xAA80_0000
+                // [ AA BB CC    ]  -> 0x0000_80CC / 0xCC80_0000
+
+                partialResult = Unsafe.AddByteOffset(ref data, (nuint)count & 2);
+
+                if (BitConverter.IsLittleEndian)
+                {
+                    partialResult |= 0x8000;
+                }
+                else
+                {
+                    partialResult <<= 24;
+                    partialResult |= 0x800000u;
+                }
+            }
+
+            if ((count & 0b_0010) != 0)
+            {
+                // If the buffer is 2 or 3 bytes in length, let's read a single ushort now
+                // and merge it into the partial result. This will result in partialResult
+                // having one of the two values below, where AA BB CC are the buffer bytes.
+                // 
+                //                  (little-endian / big-endian)
+                // [ AA BB       ]  -> 0x0080_BBAA / 0xAABB_8000
+                // [ AA BB CC    ]  -> 0x80CC_BBAA / 0xAABB_CC80 (carried over from above)
+
+                if (BitConverter.IsLittleEndian)
+                {
+                    partialResult <<= 16;
+                    partialResult |= (uint)Unsafe.ReadUnaligned<ushort>(ref data);
+                }
+                else
+                {
+                    partialResult |= (uint)Unsafe.ReadUnaligned<ushort>(ref data);
+                    partialResult = BitOperations.RotateLeft(partialResult, 16);
+                }
+            }
+
+            // Everything is consumed! Go perform the final rounds and return.
+
+            goto DoFinalRoundsAndReturn;
          }
-        
+
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          private static void Block(ref uint rp0, ref uint rp1)
          {
diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Comparison.cs b/src/libraries/System.Private.CoreLib/src/System/String.Comparison.cs

index 826848a..23781b6 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/String.Comparison.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/String.Comparison.cs
@@ -748,7 +748,9 @@ namespace System
          public override int GetHashCode()
          {
              ulong seed = Marvin.DefaultSeed;
-            return Marvin.ComputeHash32(ref Unsafe.As<char, byte>(ref _firstChar), _stringLength * 2 /* in bytes, not chars */, (uint)seed, (uint)(seed >> 32));
+
+            // Multiplication below will not overflow since going from positive Int32 to UInt32.
+            return Marvin.ComputeHash32(ref Unsafe.As<char, byte>(ref _firstChar), (uint)_stringLength * 2 /* in bytes, not chars */, (uint)seed, (uint)(seed >> 32));
          }
  
          // Gets a hash code for this string and this comparison. If strings A and B and comparison C are such
@@ -767,7 +769,9 @@ namespace System
          public static int GetHashCode(ReadOnlySpan<char> value)
          {
              ulong seed = Marvin.DefaultSeed;
-            return Marvin.ComputeHash32(ref Unsafe.As<char, byte>(ref MemoryMarshal.GetReference(value)), value.Length * 2 /* in bytes, not chars */, (uint)seed, (uint)(seed >> 32));
+
+            // Multiplication below will not overflow since going from positive Int32 to UInt32.
+            return Marvin.ComputeHash32(ref Unsafe.As<char, byte>(ref MemoryMarshal.GetReference(value)), (uint)value.Length * 2 /* in bytes, not chars */, (uint)seed, (uint)(seed >> 32));
          }
  
          // A span-based equivalent of String.GetHashCode(StringComparison). Uses the specified comparison type.
author	Levi Broderick <GrabYourPitchforks@users.noreply.github.com>
	Fri, 29 Mar 2019 05:25:16 +0000 (22:25 -0700)
committer	GitHub <noreply@github.com>
	Fri, 29 Mar 2019 05:25:16 +0000 (22:25 -0700)
src/coreclr/src/System.Private.CoreLib/src/System/Utf8String.cs		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/Marvin.cs		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/String.Comparison.cs		patch \| blob \| history