Vectorize shorter buffers for CRC-32 on Intel (#86539)
author: Brant Burnett <bburnett@centeredgesoftware.com>
Mon, 22 May 2023 14:58:56 +0000 (10:58 -0400)
committer: GitHub <noreply@github.com>
Mon, 22 May 2023 14:58:56 +0000 (16:58 +0200)
src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.Vectorized.cs
src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs

index 3a26cab..47cc9f1 100644 (file)
@@ -5,6 +5,7 @@ using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.InteropServices;
+using static System.IO.Hashing.VectorHelper;
 
 namespace System.IO.Hashing
 {
@@ -17,7 +18,9 @@ namespace System.IO.Hashing
         private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
             BitConverter.IsLittleEndian
             && VectorHelper.IsSupported
-            && source.Length >= Vector128<byte>.Count * 4;
+            // Vectorization can process spans as short as a single vector (16 bytes), but if ARM intrinsics are supported they
+            // seem to be more performant for spans less than 8 vectors (128 bytes).
+            && source.Length >= Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1);
 
         // Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
         // followed by processing 16 byte chunks, and then processing remaining bytes individually. Requires
@@ -35,102 +38,81 @@ namespace System.IO.Hashing
             ref byte srcRef = ref MemoryMarshal.GetReference(source);
             int length = source.Length;
 
-            Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-            Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-            Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-            Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-            Vector128<ulong> x5;
+            Vector128<ulong> kConstants;
+            Vector128<ulong> x1; // Accumulator for the new CRC
+            Vector128<ulong> x2;
 
-            x1 ^= Vector128.CreateScalar(crc).AsUInt64();
-            Vector128<ulong> x0 = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
-
-            srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
-            length -= Vector128<byte>.Count * 4;
-
-            // Parallel fold blocks of 64, if any.
-            while (length >= Vector128<byte>.Count * 4)
+            if (length >= Vector128<byte>.Count * 8)
             {
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
-                Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
-                Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);
-
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
-                x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
-                x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);
-
-                Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-                Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-                Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-                Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-
-                x1 ^= x5;
-                x2 ^= x6;
-                x3 ^= x7;
-                x4 ^= x8;
-
-                x1 ^= y5;
-                x2 ^= y6;
-                x3 ^= y7;
-                x4 ^= y8;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
                 length -= Vector128<byte>.Count * 4;
-            }
-
-            // Fold into 128-bits.
-            x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x2;
-            x1 ^= x5;
+                // Load and XOR the initial CRC value
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();
+
+                kConstants = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
+
+                // Parallel fold blocks of 64, if any.
+                do
+                {
+                    Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                    Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                    Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                    Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
+
+                    x1 = FoldPolynomialPair(y5, x1, kConstants);
+                    x2 = FoldPolynomialPair(y6, x2, kConstants);
+                    x3 = FoldPolynomialPair(y7, x3, kConstants);
+                    x4 = FoldPolynomialPair(y8, x4, kConstants);
+
+                    srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
+                    length -= Vector128<byte>.Count * 4;
+                } while (length >= Vector128<byte>.Count * 4);
+
+                // Fold into 128-bits.
+                kConstants = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
+                x1 = FoldPolynomialPair(x2, x1, kConstants);
+                x1 = FoldPolynomialPair(x3, x1, kConstants);
+                x1 = FoldPolynomialPair(x4, x1, kConstants);
+            }
+            else
+            {
+                // For shorter sources just load the first vector and XOR with the CRC
+                Debug.Assert(length >= 16);
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x3;
-            x1 ^= x5;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x4;
-            x1 ^= x5;
+                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
+                length -= Vector128<byte>.Count;
+            }
 
             // Single fold blocks of 16, if any.
             while (length >= Vector128<byte>.Count)
             {
-                x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x1 ^= x2;
-                x1 ^= x5;
+                x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1,
+                    Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL));
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
                 length -= Vector128<byte>.Count;
             }
 
             // Fold 128 bits to 64 bits.
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
-            x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
-            x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
-            x1 ^= x2;
-
-            x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0
-
-            x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
-            x1 &= x3;
-            x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 ^= x2;
+            Vector128<ulong> bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
+            x1 = ShiftRightBytesInVector(x1, 8) ^
+                 CarrylessMultiplyLower(x1, Vector128.CreateScalar(0x00ccaa009eUL));
+            x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(0x0163cd6124UL)) ^ // k5, k0
+                 ShiftRightBytesInVector(x1, 4);
 
             // Reduce to 32 bits.
-            x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
-
-            x2 = x1 & x3;
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
-            x2 &= x3;
-            x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
+            kConstants = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
+            x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask;
+            x2 = CarrylessMultiplyLower(x2, kConstants);
             x1 ^= x2;
 
             // Process the remaining bytes, if any
index cb2339c..095bbce 100644 (file)
@@ -5,6 +5,7 @@ using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.InteropServices;
+using static System.IO.Hashing.VectorHelper;
 
 namespace System.IO.Hashing
 {
@@ -72,7 +73,7 @@ namespace System.IO.Hashing
                 // Load and XOR the initial CRC value
                 // The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
                 // because the data will be byte-reflected and will align with the initial CRC at the correct place.
-                x0 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
+                x0 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));
 
                 kConstants = Vector128.Create(0x5cf79dea9ac37d6UL, 0x001067e571d7d5c2UL); // k3, k4
 
@@ -81,36 +82,36 @@ namespace System.IO.Hashing
                 {
                     Vector128<ulong> y1 = LoadFromSource(ref srcRef, 0);
                     Vector128<ulong> y2 = LoadFromSource(ref srcRef, 16);
-                    x0 = VectorHelper.FoldPolynomialPair(y1, x0, kConstants);
-                    x1 = VectorHelper.FoldPolynomialPair(y2, x1, kConstants);
+                    x0 = FoldPolynomialPair(y1, x0, kConstants);
+                    x1 = FoldPolynomialPair(y2, x1, kConstants);
 
                     y1 = LoadFromSource(ref srcRef, 32);
                     y2 = LoadFromSource(ref srcRef, 48);
-                    x2 = VectorHelper.FoldPolynomialPair(y1, x2, kConstants);
-                    x3 = VectorHelper.FoldPolynomialPair(y2, x3, kConstants);
+                    x2 = FoldPolynomialPair(y1, x2, kConstants);
+                    x3 = FoldPolynomialPair(y2, x3, kConstants);
 
                     y1 = LoadFromSource(ref srcRef, 64);
                     y2 = LoadFromSource(ref srcRef, 80);
-                    x4 = VectorHelper.FoldPolynomialPair(y1, x4, kConstants);
-                    x5 = VectorHelper.FoldPolynomialPair(y2, x5, kConstants);
+                    x4 = FoldPolynomialPair(y1, x4, kConstants);
+                    x5 = FoldPolynomialPair(y2, x5, kConstants);
 
                     y1 = LoadFromSource(ref srcRef, 96);
                     y2 = LoadFromSource(ref srcRef, 112);
-                    x6 = VectorHelper.FoldPolynomialPair(y1, x6, kConstants);
-                    x7 = VectorHelper.FoldPolynomialPair(y2, x7, kConstants);
+                    x6 = FoldPolynomialPair(y1, x6, kConstants);
+                    x7 = FoldPolynomialPair(y2, x7, kConstants);
 
                     srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 8);
                     length -= Vector128<byte>.Count * 8;
                 } while (length >= Vector128<byte>.Count * 8);
 
                 // Fold into 128-bits in x7
-                x7 = VectorHelper.FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
-                x7 = VectorHelper.FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
-                x7 = VectorHelper.FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
-                x7 = VectorHelper.FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
-                x7 = VectorHelper.FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
-                x7 = VectorHelper.FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
-                x7 = VectorHelper.FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
+                x7 = FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
+                x7 = FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
+                x7 = FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
+                x7 = FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
+                x7 = FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
+                x7 = FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
+                x7 = FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
             }
             else
             {
@@ -122,7 +123,7 @@ namespace System.IO.Hashing
                 // Load and XOR the initial CRC value
                 // The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
                 // because the data will be byte-reflected and will align with the initial CRC at the correct place.
-                x7 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
+                x7 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
                 length -= Vector128<byte>.Count;
@@ -131,7 +132,7 @@ namespace System.IO.Hashing
             // Single fold blocks of 16, if any, into x7
             while (length >= Vector128<byte>.Count)
             {
-                x7 = VectorHelper.FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
+                x7 = FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
                     Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
@@ -139,14 +140,14 @@ namespace System.IO.Hashing
             }
 
             // Compute CRC of a 128-bit value and fold to the upper 64-bits
-            x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
-                 VectorHelper.ShiftLowerToUpper(x7);
+            x7 = CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
+                 ShiftLowerToUpper(x7);
 
             // Barrett reduction
             kConstants = Vector128.Create(0x578d29d06cc4f872UL, 0x42f0e1eba9ea3693UL); // k7, k8
             Vector128<ulong> temp = x7;
-            x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
-            x7 = VectorHelper.CarrylessMultiplyUpper(x7, kConstants);
+            x7 = CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
+            x7 = CarrylessMultiplyUpper(x7, kConstants);
             x7 ^= temp;
 
             // Process the remaining bytes, if any