Vectorize the CRC64 implementation (#85221)
authorBrant Burnett <bburnett@centeredgesoftware.com>
Thu, 18 May 2023 18:09:37 +0000 (14:09 -0400)
committerGitHub <noreply@github.com>
Thu, 18 May 2023 18:09:37 +0000 (20:09 +0200)
THIRD-PARTY-NOTICES.TXT
src/libraries/System.IO.Hashing/src/System.IO.Hashing.csproj
src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.Vectorized.cs
src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs [new file with mode: 0644]
src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.cs
src/libraries/System.IO.Hashing/src/System/IO/Hashing/VectorHelper.cs [new file with mode: 0644]
src/libraries/System.IO.Hashing/tests/Crc64Tests.cs

index 13c41e4..5385581 100644 (file)
@@ -1231,6 +1231,7 @@ License for Fast CRC Computation
 --------------------------------------
 
 https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/crc/crc32_ieee_by4.asm
+https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/crc/crc64_ecma_norm_by8.asm
 
 Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
index 85072ee..3384ea5 100644 (file)
@@ -21,6 +21,8 @@ System.IO.Hashing.XxHash32</PackageDescription>
              Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net7.0'))" />
     <Compile Include="System\IO\Hashing\Crc64.cs" />
     <Compile Include="System\IO\Hashing\Crc64.Table.cs" />
+    <Compile Include="System\IO\Hashing\Crc64.Vectorized.cs"
+             Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net7.0'))" />
     <Compile Include="System\IO\Hashing\XxHash128.cs" />
     <Compile Include="System\IO\Hashing\XxHash3.cs" />
     <Compile Include="System\IO\Hashing\XxHash32.cs" />
@@ -31,6 +33,8 @@ System.IO.Hashing.XxHash32</PackageDescription>
     <Compile Include="System\IO\Hashing\NonCryptographicHashAlgorithm.cs" />
     <Compile Include="System\IO\Hashing\BitOperations.cs"
              Condition="'$(TargetFrameworkIdentifier)' != '.NETCoreApp'" />
+    <Compile Include="System\IO\Hashing\VectorHelper.cs"
+             Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net7.0'))" />
   </ItemGroup>
 
   <ItemGroup Condition="'$(TargetFrameworkIdentifier)' != '.NETCoreApp'">
index f584efa..3a26cab 100644 (file)
@@ -2,94 +2,21 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Diagnostics;
-using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
 using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics.Arm;
-using Aes = System.Runtime.Intrinsics.Arm.Aes;
 
 namespace System.IO.Hashing
 {
     public partial class Crc32
     {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<ulong> CarrylessMultiplyLower(Vector128<ulong> left, Vector128<ulong> right)
-        {
-            if (Pclmulqdq.IsSupported)
-            {
-                return Pclmulqdq.CarrylessMultiply(left, right, 0x00);
-            }
-
-            if (Aes.IsSupported)
-            {
-                return Aes.PolynomialMultiplyWideningLower(left.GetLower(), right.GetLower());
-            }
-
-            ThrowHelper.ThrowUnreachableException();
-            return default;
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<ulong> CarrylessMultiplyUpper(Vector128<ulong> left, Vector128<ulong> right)
-        {
-            if (Pclmulqdq.IsSupported)
-            {
-                return Pclmulqdq.CarrylessMultiply(left, right, 0x11);
-            }
-
-            if (Aes.IsSupported)
-            {
-                return Aes.PolynomialMultiplyWideningUpper(left, right);
-            }
-
-            ThrowHelper.ThrowUnreachableException();
-            return default;
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<ulong> CarrylessMultiplyLeftLowerRightUpper(Vector128<ulong> left, Vector128<ulong> right)
-        {
-            if (Pclmulqdq.IsSupported)
-            {
-                return Pclmulqdq.CarrylessMultiply(left, right, 0x10);
-            }
-
-            if (Aes.IsSupported)
-            {
-                return Aes.PolynomialMultiplyWideningLower(left.GetLower(), right.GetUpper());
-            }
-
-            ThrowHelper.ThrowUnreachableException();
-            return default;
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<ulong> ShiftRightBytesInVector(Vector128<ulong> operand,
-            [ConstantExpected(Max = (byte)15)] byte numBytesToShift)
-        {
-            if (Sse2.IsSupported)
-            {
-                return Sse2.ShiftRightLogical128BitLane(operand, numBytesToShift);
-            }
-
-            if (AdvSimd.IsSupported)
-            {
-                return AdvSimd.ExtractVector128(operand.AsByte(), Vector128<byte>.Zero, numBytesToShift).AsUInt64();
-            }
-
-            ThrowHelper.ThrowUnreachableException();
-            return default;
-        }
-
         // We check for little endian byte order here in case we're ever on ARM in big endian mode.
         // All of these checks except the length check are elided by JIT, so the JITted implementation
         // will be either a return false or a length check against a constant. This means this method
         // should be inlined into the caller.
         private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
             BitConverter.IsLittleEndian
-            && (Pclmulqdq.IsSupported || (Aes.IsSupported && AdvSimd.IsSupported))
+            && VectorHelper.IsSupported
             && source.Length >= Vector128<byte>.Count * 4;
 
         // Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
@@ -123,15 +50,15 @@ namespace System.IO.Hashing
             // Parallel fold blocks of 64, if any.
             while (length >= Vector128<byte>.Count * 4)
             {
-                x5 = CarrylessMultiplyLower(x1, x0);
-                Vector128<ulong> x6 = CarrylessMultiplyLower(x2, x0);
-                Vector128<ulong> x7 = CarrylessMultiplyLower(x3, x0);
-                Vector128<ulong> x8 = CarrylessMultiplyLower(x4, x0);
+                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
+                Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
+                Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
+                Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);
 
-                x1 = CarrylessMultiplyUpper(x1, x0);
-                x2 = CarrylessMultiplyUpper(x2, x0);
-                x3 = CarrylessMultiplyUpper(x3, x0);
-                x4 = CarrylessMultiplyUpper(x4, x0);
+                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
+                x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
+                x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
+                x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);
 
                 Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
                 Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
@@ -155,18 +82,18 @@ namespace System.IO.Hashing
             // Fold into 128-bits.
             x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
 
-            x5 = CarrylessMultiplyLower(x1, x0);
-            x1 = CarrylessMultiplyUpper(x1, x0);
+            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
+            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
             x1 ^= x2;
             x1 ^= x5;
 
-            x5 = CarrylessMultiplyLower(x1, x0);
-            x1 = CarrylessMultiplyUpper(x1, x0);
+            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
+            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
             x1 ^= x3;
             x1 ^= x5;
 
-            x5 = CarrylessMultiplyLower(x1, x0);
-            x1 = CarrylessMultiplyUpper(x1, x0);
+            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
+            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
             x1 ^= x4;
             x1 ^= x5;
 
@@ -175,8 +102,8 @@ namespace System.IO.Hashing
             {
                 x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
 
-                x5 = CarrylessMultiplyLower(x1, x0);
-                x1 = CarrylessMultiplyUpper(x1, x0);
+                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
+                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
                 x1 ^= x2;
                 x1 ^= x5;
 
@@ -185,25 +112,25 @@ namespace System.IO.Hashing
             }
 
             // Fold 128 bits to 64 bits.
-            x2 = CarrylessMultiplyLeftLowerRightUpper(x1, x0);
+            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
             x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
-            x1 = ShiftRightBytesInVector(x1, 8);
+            x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
             x1 ^= x2;
 
             x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0
 
-            x2 = ShiftRightBytesInVector(x1, 4);
+            x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
             x1 &= x3;
-            x1 = CarrylessMultiplyLower(x1, x0);
+            x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
             x1 ^= x2;
 
             // Reduce to 32 bits.
             x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
 
             x2 = x1 & x3;
-            x2 = CarrylessMultiplyLeftLowerRightUpper(x2, x0);
+            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
             x2 &= x3;
-            x2 = CarrylessMultiplyLower(x2, x0);
+            x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
             x1 ^= x2;
 
             // Process the remaining bytes, if any
diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs
new file mode 100644 (file)
index 0000000..cb2339c
--- /dev/null
@@ -0,0 +1,159 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.InteropServices;
+
+namespace System.IO.Hashing
+{
+    public partial class Crc64
+    {
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<ulong> LoadFromSource(ref byte source, nuint elementOffset)
+        {
+            Vector128<byte> vector = Vector128.LoadUnsafe(ref source, elementOffset);
+
+            if (BitConverter.IsLittleEndian)
+            {
+                // Reverse the byte order.
+
+                // SSSE3 is required to get PSHUFB acceleration for Vector128.Shuffle on x86/x64.
+                // However, the gains from vectorizing the rest of the operations seem to to be
+                // greater than the added cost of emulating the shuffle, so we don't require SSSE3 support.
+                vector = Vector128.Shuffle(vector,
+                    Vector128.Create((byte)0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
+                        0x02, 0x01, 0x00));
+            }
+
+            return vector.AsUInt64();
+        }
+
+        // All of these checks except the length check are elided by JIT, so the JITted implementation
+        // will be either a return false or a length check against a constant. This means this method
+        // should be inlined into the caller.
+        private static bool CanBeVectorized(ReadOnlySpan<byte> source) => VectorHelper.IsSupported && source.Length >= Vector128<byte>.Count;
+
+        // Processes the bytes in source in 128 byte chunks using intrinsics, followed by processing 16
+        // byte chunks, and then processing remaining bytes individually. Requires at least 16 bytes of data.
+        // Requires little endian byte order and support for PCLMULQDQ intrinsics on Intel architecture
+        // or AES and AdvSimd intrinsics on ARM architecture. Based on the algorithm put forth in the Intel paper
+        // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" in December, 2009 and the
+        // Intel reference implementation.
+        // https://github.com/intel/isa-l/blob/33a2d9484595c2d6516c920ce39a694c144ddf69/crc/crc64_ecma_norm_by8.asm
+        private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
+        {
+            Debug.Assert(CanBeVectorized(source), "source cannot be vectorized.");
+
+            // Work with a reference to where we're at in the ReadOnlySpan and a local length
+            // to avoid extraneous range checks.
+            ref byte srcRef = ref MemoryMarshal.GetReference(source);
+            int length = source.Length;
+
+            Vector128<ulong> x7; // Accumulator for the new CRC
+            Vector128<ulong> kConstants; // Used to store reused constants
+
+            if (length >= Vector128<byte>.Count * 16) // At least 256 bytes
+            {
+                // Load the first 128 bytes
+                Vector128<ulong> x0 = LoadFromSource(ref srcRef, 0);
+                Vector128<ulong> x1 = LoadFromSource(ref srcRef, 16);
+                Vector128<ulong> x2 = LoadFromSource(ref srcRef, 32);
+                Vector128<ulong> x3 = LoadFromSource(ref srcRef, 48);
+                Vector128<ulong> x4 = LoadFromSource(ref srcRef, 64);
+                Vector128<ulong> x5 = LoadFromSource(ref srcRef, 80);
+                Vector128<ulong> x6 = LoadFromSource(ref srcRef, 96);
+                x7 = LoadFromSource(ref srcRef, 112);
+
+                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 8);
+                length -= Vector128<byte>.Count * 8;
+
+                // Load and XOR the initial CRC value
+                // The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
+                // because the data will be byte-reflected and will align with the initial CRC at the correct place.
+                x0 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
+
+                kConstants = Vector128.Create(0x5cf79dea9ac37d6UL, 0x001067e571d7d5c2UL); // k3, k4
+
+                // Parallel fold blocks of 128
+                do
+                {
+                    Vector128<ulong> y1 = LoadFromSource(ref srcRef, 0);
+                    Vector128<ulong> y2 = LoadFromSource(ref srcRef, 16);
+                    x0 = VectorHelper.FoldPolynomialPair(y1, x0, kConstants);
+                    x1 = VectorHelper.FoldPolynomialPair(y2, x1, kConstants);
+
+                    y1 = LoadFromSource(ref srcRef, 32);
+                    y2 = LoadFromSource(ref srcRef, 48);
+                    x2 = VectorHelper.FoldPolynomialPair(y1, x2, kConstants);
+                    x3 = VectorHelper.FoldPolynomialPair(y2, x3, kConstants);
+
+                    y1 = LoadFromSource(ref srcRef, 64);
+                    y2 = LoadFromSource(ref srcRef, 80);
+                    x4 = VectorHelper.FoldPolynomialPair(y1, x4, kConstants);
+                    x5 = VectorHelper.FoldPolynomialPair(y2, x5, kConstants);
+
+                    y1 = LoadFromSource(ref srcRef, 96);
+                    y2 = LoadFromSource(ref srcRef, 112);
+                    x6 = VectorHelper.FoldPolynomialPair(y1, x6, kConstants);
+                    x7 = VectorHelper.FoldPolynomialPair(y2, x7, kConstants);
+
+                    srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 8);
+                    length -= Vector128<byte>.Count * 8;
+                } while (length >= Vector128<byte>.Count * 8);
+
+                // Fold into 128-bits in x7
+                x7 = VectorHelper.FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
+                x7 = VectorHelper.FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
+                x7 = VectorHelper.FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
+                x7 = VectorHelper.FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
+                x7 = VectorHelper.FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
+                x7 = VectorHelper.FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
+                x7 = VectorHelper.FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
+            }
+            else
+            {
+                // For shorter sources just load the first vector and XOR with the CRC
+                Debug.Assert(length >= 16);
+
+                x7 = LoadFromSource(ref srcRef, 0);
+
+                // Load and XOR the initial CRC value
+                // The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
+                // because the data will be byte-reflected and will align with the initial CRC at the correct place.
+                x7 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
+
+                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
+                length -= Vector128<byte>.Count;
+            }
+
+            // Single fold blocks of 16, if any, into x7
+            while (length >= Vector128<byte>.Count)
+            {
+                x7 = VectorHelper.FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
+                    Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
+
+                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
+                length -= Vector128<byte>.Count;
+            }
+
+            // Compute CRC of a 128-bit value and fold to the upper 64-bits
+            x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
+                 VectorHelper.ShiftLowerToUpper(x7);
+
+            // Barrett reduction
+            kConstants = Vector128.Create(0x578d29d06cc4f872UL, 0x42f0e1eba9ea3693UL); // k7, k8
+            Vector128<ulong> temp = x7;
+            x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
+            x7 = VectorHelper.CarrylessMultiplyUpper(x7, kConstants);
+            x7 ^= temp;
+
+            // Process the remaining bytes, if any
+            ulong result = x7.GetElement(0);
+            return length > 0
+                ? UpdateScalar(result, MemoryMarshal.CreateReadOnlySpan(ref srcRef, length))
+                : result;
+        }
+    }
+}
index 7ec30ba..e9993a7 100644 (file)
@@ -167,6 +167,18 @@ namespace System.IO.Hashing
 
         private static ulong Update(ulong crc, ReadOnlySpan<byte> source)
         {
+#if NET7_0_OR_GREATER
+            if (CanBeVectorized(source))
+            {
+                return UpdateVectorized(crc, source);
+            }
+#endif
+
+            return UpdateScalar(crc, source);
+        }
+
+        private static ulong UpdateScalar(ulong crc, ReadOnlySpan<byte> source)
+        {
             ReadOnlySpan<ulong> crcLookup = CrcLookup;
             for (int i = 0; i < source.Length; i++)
             {
diff --git a/src/libraries/System.IO.Hashing/src/System/IO/Hashing/VectorHelper.cs b/src/libraries/System.IO.Hashing/src/System/IO/Hashing/VectorHelper.cs
new file mode 100644 (file)
index 0000000..8e9d140
--- /dev/null
@@ -0,0 +1,134 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+using Aes = System.Runtime.Intrinsics.Arm.Aes;
+
+namespace System.IO.Hashing
+{
+    // Helpers which provide equivalent intrinsics for Intel and ARM architectures. Should only be used
+    // if the intrinsics are available.
+    internal static class VectorHelper
+    {
+        // Pclmulqdq implies support for SSE2
+        public static bool IsSupported => Pclmulqdq.IsSupported || (Aes.IsSupported && AdvSimd.IsSupported);
+
+        // Performs carryless multiplication of the upper pairs of source and constants and the lower pairs of source and constants,
+        // then folds them into target using carryless addition.
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> FoldPolynomialPair(Vector128<ulong> target, Vector128<ulong> source, Vector128<ulong> constants)
+        {
+            target ^= CarrylessMultiplyUpper(source, constants);
+            target ^= CarrylessMultiplyLower(source, constants);
+
+            return target;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> CarrylessMultiplyLower(Vector128<ulong> left, Vector128<ulong> right)
+        {
+            if (Pclmulqdq.IsSupported)
+            {
+                return Pclmulqdq.CarrylessMultiply(left, right, 0x00);
+            }
+
+            if (Aes.IsSupported)
+            {
+                return Aes.PolynomialMultiplyWideningLower(left.GetLower(), right.GetLower());
+            }
+
+            ThrowHelper.ThrowUnreachableException();
+            return default;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> CarrylessMultiplyUpper(Vector128<ulong> left, Vector128<ulong> right)
+        {
+            if (Pclmulqdq.IsSupported)
+            {
+                return Pclmulqdq.CarrylessMultiply(left, right, 0x11);
+            }
+
+            if (Aes.IsSupported)
+            {
+                return Aes.PolynomialMultiplyWideningUpper(left, right);
+            }
+
+            ThrowHelper.ThrowUnreachableException();
+            return default;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> CarrylessMultiplyLeftUpperRightLower(Vector128<ulong> left, Vector128<ulong> right)
+        {
+            if (Pclmulqdq.IsSupported)
+            {
+                return Pclmulqdq.CarrylessMultiply(left, right, 0x01);
+            }
+
+            if (Aes.IsSupported)
+            {
+                return Aes.PolynomialMultiplyWideningLower(left.GetUpper(), right.GetLower());
+            }
+
+            ThrowHelper.ThrowUnreachableException();
+            return default;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> CarrylessMultiplyLeftLowerRightUpper(Vector128<ulong> left, Vector128<ulong> right)
+        {
+            if (Pclmulqdq.IsSupported)
+            {
+                return Pclmulqdq.CarrylessMultiply(left, right, 0x10);
+            }
+
+            if (Aes.IsSupported)
+            {
+                return Aes.PolynomialMultiplyWideningLower(left.GetLower(), right.GetUpper());
+            }
+
+            ThrowHelper.ThrowUnreachableException();
+            return default;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> ShiftRightBytesInVector(Vector128<ulong> operand,
+            [ConstantExpected(Max = (byte)15)] byte numBytesToShift)
+        {
+            if (Sse2.IsSupported)
+            {
+                return Sse2.ShiftRightLogical128BitLane(operand, numBytesToShift);
+            }
+
+            if (AdvSimd.IsSupported)
+            {
+                return AdvSimd.ExtractVector128(operand.AsByte(), Vector128<byte>.Zero, numBytesToShift).AsUInt64();
+            }
+
+            ThrowHelper.ThrowUnreachableException();
+            return default;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static Vector128<ulong> ShiftLowerToUpper(Vector128<ulong> operand)
+        {
+            if (Sse2.IsSupported)
+            {
+                return Sse2.ShiftLeftLogical128BitLane(operand, 8);
+            }
+
+            if (AdvSimd.IsSupported)
+            {
+                return AdvSimd.ExtractVector128(Vector128<byte>.Zero, operand.AsByte(), 8).AsUInt64();
+            }
+
+            ThrowHelper.ThrowUnreachableException();
+            return default;
+        }
+    }
+}
index c439ec8..12621dc 100644 (file)
@@ -69,6 +69,26 @@ namespace System.IO.Hashing.Tests
                     "The quick brown fox jumps over the lazy dog",
                     "The quick brown fox jumps over the lazy dog"u8.ToArray(),
                     "41E05242FFA9883B"),
+                // Test 256 bytes for vector optimizations
+                new TestCase(
+                    "Lorem ipsum 256",
+                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent non ipsum quis mauris euismod hendrerit id sed lacus. Duis quam neque, porta et volutpat nec, tempor eget nisl. Nunc quis leo quis nisi mattis molestie. Donec a diam velit. Sed a tempus nec."u8.ToArray(),
+                    "DA70046E6B79DD83"),
+                // Test a multiple of 128 bytes greater than 256 bytes + 16 bytes for vector optimizations
+                new TestCase(
+                    "Lorem ipsum 272",
+                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent non ipsum quis mauris euismod hendrerit id sed lacus. Duis quam neque, porta et volutpat nec, tempor eget nisl. Nunc quis leo quis nisi mattis molestie. Donec a diam velit. Sed a tempus nec1234567890abcdef."u8.ToArray(),
+                    "A94F5E9C5557F65A"),
+                // Test a multiple of 128 bytes greater than 256 bytes for vector optimizations
+                new TestCase(
+                    "Lorem ipsum 384",
+                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam lobortis non felis et pretium. Suspendisse commodo dignissim sagittis. Etiam vestibulum luctus mollis. Ut finibus, nisl in sodales sagittis, leo mauris sollicitudin odio, id sodales nisl ante vitae quam. Nunc ut mi at lacus ultricies efficitur vitae eu ligula. Donec tincidunt, nisi suscipit facilisis auctor, metus non."u8.ToArray(),
+                    "5768E3F2E9A63829"),
+                // Test data that is > 256 bytes but not a multiple of 16 for vector optimizations
+                new TestCase(
+                     "Lorem ipsum 1001",
+                     "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer ac urna vitae nibh sagittis porttitor et vel ante. Ut molestie sit amet velit ac mattis. Sed ullamcorper nunc non neque imperdiet, vehicula bibendum sapien efficitur. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Suspendisse potenti. Duis sem dui, malesuada non pharetra at, feugiat id mi. Nulla facilisi. Fusce a scelerisque magna. Ut leo justo, auctor quis nisi et, sollicitudin pretium odio. Sed eu nibh mollis, pretium lectus nec, posuere nulla. Morbi ac euismod purus. Morbi rhoncus leo est, at volutpat nunc pretium in. Aliquam erat volutpat. Curabitur eu lacus mollis, varius lectus ut, tincidunt eros. Nullam a velit hendrerit, euismod magna id, fringilla sem. Phasellus scelerisque hendrerit est, vel imperdiet enim auctor a. Aenean vel ultricies nunc. Suspendisse ac tincidunt urna. Nulla tempor dolor ut ligula accumsan, tempus auctor massa gravida. Aenean non odio et augue pellena."u8.ToArray(),
+                     "3ECF3A363FC5BD59"),
             };
 
         protected override NonCryptographicHashAlgorithm CreateInstance() => new Crc64();