Vectorize ProbabilisticMap.IndexOfAny (#80963)
author Miha Zupan <mihazupan.zupan1@gmail.com>
Fri, 10 Mar 2023 04:17:27 +0000 (05:17 +0100)
committer GitHub <noreply@github.com>
Fri, 10 Mar 2023 04:17:27 +0000 (23:17 -0500)
* Vectorize ProbabilisticMap.IndexOfAny on AVX2

* Use ResetLowestSetBit from BitOperations

* Speed up Avx2 and add Vector128 support

* Add Vector{128/256}.LoadUnsafe(ref char) and Vector128.ShuffleUnsafe

* Use Vector128.ShuffleUnsafe in more places

* PR feedback

* Replace another ShiftRightLogical with '>>>'

* Add WASM path to Vector128.ShuffleUnsafe

* PR feedback

src/libraries/Common/src/System/HexConverter.cs
src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs
src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs
src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs
src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs

index 7986ccb43a9b1fa1231a867e51b9c4761de0b6ee..81b56970be05a116ea9dd9706e63506411500ae0 100644 (file)
@@ -99,13 +99,8 @@ namespace System
             Vector128<byte> lowNibbles = Vector128.UnpackLow(shiftedSrc, src);
             Vector128<byte> highNibbles = Vector128.UnpackHigh(shiftedSrc, src);
 
-            return (ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)),
-                ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF)));
-
-            // TODO: remove once https://github.com/dotnet/runtime/pull/80963 is merged
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            static Vector128<byte> ShuffleUnsafe(Vector128<byte> value, Vector128<byte> mask)
-                => Ssse3.IsSupported ? Ssse3.Shuffle(value, mask) : AdvSimd.Arm64.VectorTableLookup(value, mask);
+            return (Vector128.ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)),
+                Vector128.ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF)));
         }
 
         private static void EncodeToUtf16_Vector128(ReadOnlySpan<byte> bytes, Span<char> chars, Casing casing)
index f4ebc942ac331a110a8844c1875ee88c6c7d0c0f..cc239d1a5e981a5e858483a12bda1c4232abfbce 100644 (file)
@@ -477,20 +477,17 @@ namespace System.Buffers.Text
             destBytes = dest;
         }
 
-        // This can be replaced once https://github.com/dotnet/runtime/issues/63331 is implemented.
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte> right, Vector128<byte> mask8F)
         {
             Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian);
 
-            if (Ssse3.IsSupported)
+            if (AdvSimd.Arm64.IsSupported)
             {
-                return Ssse3.Shuffle(left, right);
-            }
-            else
-            {
-                return AdvSimd.Arm64.VectorTableLookup(left, Vector128.BitwiseAnd(right, mask8F));
+                right &= mask8F;
             }
+
+            return Vector128.ShuffleUnsafe(left, right);
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
index 1dd36f67306615f28a01973e1b3718a628d22e64..45007ac7d22b52cecc3f8e86d0170fffaf2eb86c 100644 (file)
@@ -88,8 +88,8 @@ namespace System.Globalization
             Vector128<ushort> vec2;
             do
             {
-                vec1 = Vector128.LoadUnsafe(ref Unsafe.As<char, ushort>(ref charA), i);
-                vec2 = Vector128.LoadUnsafe(ref Unsafe.As<char, ushort>(ref charB), i);
+                vec1 = Vector128.LoadUnsafe(ref charA, i);
+                vec2 = Vector128.LoadUnsafe(ref charB, i);
 
                 if (!Utf16Utility.AllCharsInVector128AreAscii(vec1 | vec2))
                 {
index 87c258cee7983246d812b2f838eea6887c7fff00..207024d99962fb1fb386676af58240122eaa5063 100644 (file)
@@ -860,10 +860,10 @@ namespace System.Buffers
 
             // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle.
             // Lookup the rows via the lower nibble and the column via the higher nibble.
-            Vector128<byte> bitMask = Shuffle(bitmapLookup, lowNibbles);
+            Vector128<byte> bitMask = Vector128.ShuffleUnsafe(bitmapLookup, lowNibbles);
 
             // For values above 127, the high nibble will be above 7. We construct the positions vector for the shuffle such that those values map to 0.
-            Vector128<byte> bitPositions = Shuffle(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles);
+            Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles);
 
             Vector128<byte> result = bitMask & bitPositions;
             return result;
@@ -909,10 +909,10 @@ namespace System.Buffers
             Vector128<byte> lowNibbles = source & Vector128.Create((byte)0xF);
             Vector128<byte> highNibbles = Vector128.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector128.Create((byte)0xF);
 
-            Vector128<byte> row0 = Shuffle(bitmapLookup0, lowNibbles);
-            Vector128<byte> row1 = Shuffle(bitmapLookup1, lowNibbles);
+            Vector128<byte> row0 = Vector128.ShuffleUnsafe(bitmapLookup0, lowNibbles);
+            Vector128<byte> row1 = Vector128.ShuffleUnsafe(bitmapLookup1, lowNibbles);
 
-            Vector128<byte> bitmask = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibbles);
+            Vector128<byte> bitmask = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibbles);
 
             Vector128<byte> mask = Vector128.GreaterThan(highNibbles.AsSByte(), Vector128.Create((sbyte)0x7)).AsByte();
             Vector128<byte> bitsets = Vector128.ConditionalSelect(mask, row1, row0);
@@ -944,16 +944,6 @@ namespace System.Buffers
             return TNegator.NegateIfNeeded(result);
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
-        {
-            // We're not using Vector128.Shuffle as the caller already accounts for and relies on differences in behavior between platforms.
-            return
-                Ssse3.IsSupported ? Ssse3.Shuffle(vector, indices) :
-                AdvSimd.Arm64.IsSupported ? AdvSimd.Arm64.VectorTableLookup(vector, indices) :
-                PackedSimd.Swizzle(vector, indices);
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static unsafe int ComputeFirstIndex<T, TNegator>(ref T searchSpace, ref T current, Vector128<byte> result)
             where TNegator : struct, INegator
index d0a0a7821913e12722a899d01437cb7ff1ab9552..2b13c5b19ce7e3b37cb3feca3e470743bc2552a4 100644 (file)
@@ -1,9 +1,13 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Diagnostics;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
 
 #pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228
 
@@ -23,8 +27,17 @@ namespace System.Buffers
     [StructLayout(LayoutKind.Sequential)]
     internal readonly struct ProbabilisticMap
     {
-        private const int IndexMask = 0x7;
-        private const int IndexShift = 0x3;
+        // The vectorized algorithm operates on bytes instead of uint32s.
+        // The index and shift are adjusted so that we represent the structure
+        // as "32 x uint8" instead of "8 x uint32".
+        // We use the vectorized implementation when we have access to Sse41 or Arm64 intrinsics.
+        private const uint VectorizedIndexMask = 31u;
+        private const int VectorizedIndexShift = 5;
+
+        // If we don't support vectorization, use uint32 to speed up
+        // "IsCharBitSet" checks in scalar loops.
+        private const uint PortableIndexMask = 7u;
+        private const int PortableIndexShift = 3;
 
         private readonly uint _e0, _e1, _e2, _e3, _e4, _e5, _e6, _e7;
 
@@ -56,23 +69,116 @@ namespace System.Buffers
             if (hasAscii)
             {
                 // Common to search for ASCII symbols. Just set the high value once.
-                charMap |= 1u;
+                SetCharBit(ref charMap, 0);
             }
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void SetCharBit(ref uint charMap, byte value) =>
-            Unsafe.Add(ref charMap, (uint)value & IndexMask) |= 1u << (value >> IndexShift);
+        private static void SetCharBit(ref uint charMap, byte value)
+        {
+            if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported)
+            {
+                Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask) |= (byte)(1u << (value >> VectorizedIndexShift));
+            }
+            else
+            {
+                Unsafe.Add(ref charMap, value & PortableIndexMask) |= 1u << (value >> PortableIndexShift);
+            }
+        }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool IsCharBitSet(ref uint charMap, byte value) =>
-            (Unsafe.Add(ref charMap, (uint)value & IndexMask) & (1u << (value >> IndexShift))) != 0;
+        private static bool IsCharBitSet(ref uint charMap, byte value) => Sse41.IsSupported || AdvSimd.Arm64.IsSupported
+            ? (Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask) & (1u << (value >> VectorizedIndexShift))) != 0
+            : (Unsafe.Add(ref charMap, value & PortableIndexMask) & (1u << (value >> PortableIndexShift))) != 0;
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static bool Contains(ref uint charMap, ReadOnlySpan<char> values, int ch) =>
             IsCharBitSet(ref charMap, (byte)ch) &&
             IsCharBitSet(ref charMap, (byte)(ch >> 8)) &&
-            values.Contains((char)ch);
+            Contains(values, (char)ch);
+        /// <summary>Checks <paramref name="values"/> for <paramref name="ch"/> by reinterpreting the chars as <see cref="short"/>s and delegating to SpanHelpers.NonPackedContainsValueType.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool Contains(ReadOnlySpan<char> values, char ch) =>
+            SpanHelpers.NonPackedContainsValueType(
+                ref Unsafe.As<char, short>(ref MemoryMarshal.GetReference(values)),
+                (short)ch,
+                values.Length);
+        /// <summary>Loads 32 chars from <paramref name="searchSpace"/> and returns, per char, 0xFF if both of its bytes have their bit set in the map and 0 otherwise. Elements are in the order produced by Avx2.PackUnsignedSaturate.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> ContainsMask32CharsAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, ref char searchSpace)
+        {
+            Vector256<ushort> source0 = Vector256.LoadUnsafe(ref searchSpace);
+            Vector256<ushort> source1 = Vector256.LoadUnsafe(ref searchSpace, (nuint)Vector256<ushort>.Count);
+            // Low byte of each char. Masking to 0..255 first means the saturating pack has nothing to clamp.
+            Vector256<byte> sourceLower = Avx2.PackUnsignedSaturate(
+                (source0 & Vector256.Create((ushort)255)).AsInt16(),
+                (source1 & Vector256.Create((ushort)255)).AsInt16());
+            // High byte of each char. The shift by 8 already leaves every element in 0..255.
+            Vector256<byte> sourceUpper = Avx2.PackUnsignedSaturate(
+                (source0 >>> 8).AsInt16(),
+                (source1 >>> 8).AsInt16());
+            // Both bytes of a char are tested against the same map.
+            Vector256<byte> resultLower = IsCharBitSetAvx2(charMapLower, charMapUpper, sourceLower);
+            Vector256<byte> resultUpper = IsCharBitSetAvx2(charMapLower, charMapUpper, sourceUpper);
+            // A char is a candidate only when both of its bytes were found.
+            return resultLower & resultUpper;
+        }
+        /// <summary>For each byte of <paramref name="values"/>, returns 0xFF if its bit is set in the 32-byte map and 0 otherwise. The low 5 bits of the byte select the map byte and the top 3 bits select the bit, matching the vectorized branch of SetCharBit.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<byte> IsCharBitSetAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, Vector256<byte> values)
+        {
+            // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
+            Vector256<byte> highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15);
+            // Table lookup turning a bit number n into the mask 1 << n. The 32-bit shift can leak one bit of the neighbouring byte past the mask of 15, but the 8-byte table repeats, so the lookup result is unaffected.
+            Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble);
+            // Each shuffle can only address 16 bytes, so both halves of the map are looked up and the right one is selected afterwards.
+            Vector256<byte> index = values & Vector256.Create((byte)VectorizedIndexMask);
+            Vector256<byte> bitMaskLower = Avx2.Shuffle(charMapLower, index);
+            Vector256<byte> bitMaskUpper = Avx2.Shuffle(charMapUpper, index - Vector256.Create((byte)16));
+            Vector256<byte> mask = Vector256.GreaterThan(index, Vector256.Create((byte)15));
+            Vector256<byte> bitMask = Vector256.ConditionalSelect(mask, bitMaskUpper, bitMaskLower);
+            // Bytes whose tested bit is set become 0xFF; all others become 0.
+            return ~Vector256.Equals(bitMask & bitPositions, Vector256<byte>.Zero);
+        }
+        /// <summary>Loads 16 chars from <paramref name="searchSpace"/> and returns, per char, 0xFF if both of its bytes have their bit set in the map and 0 otherwise.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, ref char searchSpace)
+        {
+            Vector128<ushort> source0 = Vector128.LoadUnsafe(ref searchSpace);
+            Vector128<ushort> source1 = Vector128.LoadUnsafe(ref searchSpace, (nuint)Vector128<ushort>.Count);
+            // Low byte of each char: a pack of the masked values on x86, the even-indexed bytes on Arm64.
+            Vector128<byte> sourceLower = Sse2.IsSupported
+                ? Sse2.PackUnsignedSaturate((source0 & Vector128.Create((ushort)255)).AsInt16(), (source1 & Vector128.Create((ushort)255)).AsInt16())
+                : AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte());
+            // High byte of each char: a pack of the shifted values on x86, the odd-indexed bytes on Arm64.
+            Vector128<byte> sourceUpper = Sse2.IsSupported
+                ? Sse2.PackUnsignedSaturate((source0 >>> 8).AsInt16(), (source1 >>> 8).AsInt16())
+                : AdvSimd.Arm64.UnzipOdd(source0.AsByte(), source1.AsByte());
+            // Both bytes of a char are tested against the same map.
+            Vector128<byte> resultLower = IsCharBitSet(charMapLower, charMapUpper, sourceLower);
+            Vector128<byte> resultUpper = IsCharBitSet(charMapLower, charMapUpper, sourceUpper);
+            // A char is a candidate only when both of its bytes were found.
+            return resultLower & resultUpper;
+        }
+        /// <summary>For each byte of <paramref name="values"/>, returns 0xFF if its bit is set in the 32-byte map and 0 otherwise. The low 5 bits of the byte select the map byte and the top 3 bits select the bit.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<byte> IsCharBitSet(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, Vector128<byte> values)
+        {
+            // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
+            Vector128<byte> highNibble = Sse2.IsSupported
+                ? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15)
+                : values >>> VectorizedIndexShift;
+            // Table lookup turning a bit number n into the mask 1 << n; the 8-byte table is repeated in both halves of the vector.
+            Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble);
+            // Both 16-byte halves of the map are looked up. ConditionalSelect keeps only the lookup whose index was in range, so ShuffleUnsafe's platform-dependent handling of out-of-range indices does not reach the result.
+            Vector128<byte> index = values & Vector128.Create((byte)VectorizedIndexMask);
+            Vector128<byte> bitMaskLower = Vector128.ShuffleUnsafe(charMapLower, index);
+            Vector128<byte> bitMaskUpper = Vector128.ShuffleUnsafe(charMapUpper, index - Vector128.Create((byte)16));
+            Vector128<byte> mask = Vector128.GreaterThan(index, Vector128.Create((byte)15));
+            Vector128<byte> bitMask = Vector128.ConditionalSelect(mask, bitMaskUpper, bitMaskLower);
+            // Bytes whose tested bit is set become 0xFF; all others become 0.
+            return ~Vector128.Equals(bitMask & bitPositions, Vector128<byte>.Zero);
+        }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool ShouldUseSimpleLoop(int searchSpaceLength, int valuesLength)
@@ -115,7 +221,7 @@ namespace System.Buffers
                 while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd))
                 {
                     char c = cur;
-                    if (TNegator.NegateIfNeeded(valuesSpan.Contains(c)))
+                    if (TNegator.NegateIfNeeded(Contains(valuesSpan, c)))
                     {
                         return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
                     }
@@ -147,7 +253,7 @@ namespace System.Buffers
                 for (int i = searchSpaceLength - 1; i >= 0; i--)
                 {
                     char c = Unsafe.Add(ref searchSpace, i);
-                    if (TNegator.NegateIfNeeded(valuesSpan.Contains(c)))
+                    if (TNegator.NegateIfNeeded(Contains(valuesSpan, c)))
                     {
                         return i;
                     }
@@ -198,6 +304,11 @@ namespace System.Buffers
         internal static int IndexOfAny<TNegator>(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan<char> values)
             where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
         {
+            if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= 16)
+            {
+                return IndexOfAnyVectorized(ref charMap, ref searchSpace, searchSpaceLength, values);
+            }
+
             ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
             ref char cur = ref searchSpace;
 
@@ -206,7 +317,7 @@ namespace System.Buffers
                 int ch = cur;
                 if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch)))
                 {
-                    return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
+                    return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
                 }
 
                 cur = ref Unsafe.Add(ref cur, 1);
@@ -230,5 +341,113 @@ namespace System.Buffers
 
             return -1;
         }
+        /// <summary>Scans the search space 32 chars (Avx2) or 16 chars at a time, using the map to find candidates and confirming each against <paramref name="values"/>. Returns the index of the first confirmed char, or -1. Requires at least 16 chars.</summary>
+        private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan<char> values)
+        {
+            Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported);
+            Debug.Assert(searchSpaceLength >= 16);
+
+            ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
+            ref char cur = ref searchSpace;
+            // The 32-byte map is split into two 16-byte halves, one per table lookup.
+            Vector128<byte> charMapLower = Vector128.LoadUnsafe(ref Unsafe.As<uint, byte>(ref charMap));
+            Vector128<byte> charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As<uint, byte>(ref charMap), (nuint)Vector128<byte>.Count);
+
+            if (Avx2.IsSupported && searchSpaceLength >= 32)
+            {
+                Vector256<byte> charMapLower256 = Vector256.Create(charMapLower, charMapLower);
+                Vector256<byte> charMapUpper256 = Vector256.Create(charMapUpper, charMapUpper);
+                // Last position at which a full 32-char load still fits inside the search space.
+                ref char lastStartVectorAvx2 = ref Unsafe.Subtract(ref searchSpaceEnd, 32);
+
+                while (true)
+                {
+                    Vector256<byte> result = ContainsMask32CharsAvx2(charMapLower256, charMapUpper256, ref cur);
+
+                    if (result != Vector256<byte>.Zero)
+                    {
+                        // Account for how ContainsMask32CharsAvx2 packed the source chars (Avx2.PackUnsignedSaturate).
+                        result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte();
+                        // One bit per char; bits are visited lowest first, so the earliest match is returned.
+                        uint mask = result.ExtractMostSignificantBits();
+                        do
+                        {
+                            ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask));
+                            // A set bit only marks a candidate; confirm it against the actual values.
+                            if (Contains(values, candidatePos))
+                            {
+                                return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char));
+                            }
+
+                            mask = BitOperations.ResetLowestSetBit(mask);
+                        }
+                        while (mask != 0);
+                    }
+
+                    cur = ref Unsafe.Add(ref cur, 32);
+                    // Fewer than 32 chars remain from cur: finish, or back up so the tail is covered by one overlapping load.
+                    if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVectorAvx2))
+                    {
+                        if (Unsafe.AreSame(ref cur, ref searchSpaceEnd))
+                        {
+                            return -1;
+                        }
+
+                        if (Unsafe.ByteOffset(ref cur, ref searchSpaceEnd) > 16 * sizeof(char))
+                        {
+                            // If we have more than 16 characters left to process, we can
+                            // adjust the current vector and do one last iteration of Avx2.
+                            cur = ref lastStartVectorAvx2;
+                        }
+                        else
+                        {
+                            // Otherwise adjust the vector such that we'll only need to do a single
+                            // iteration of ContainsMask16Chars below.
+                            cur = ref Unsafe.Subtract(ref searchSpaceEnd, 16);
+                            break;
+                        }
+                    }
+                }
+            }
+            // Vector128 path: the main loop when the Avx2 block was skipped, otherwise the single final iteration after the break above.
+            ref char lastStartVector = ref Unsafe.Subtract(ref searchSpaceEnd, 16);
+
+            while (true)
+            {
+                Vector128<byte> result = ContainsMask16Chars(charMapLower, charMapUpper, ref cur);
+
+                if (result != Vector128<byte>.Zero)
+                {
+                    uint mask = result.ExtractMostSignificantBits();
+                    do
+                    {
+                        ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask));
+                        // A set bit only marks a candidate; confirm it against the actual values.
+                        if (Contains(values, candidatePos))
+                        {
+                            return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char));
+                        }
+
+                        mask = BitOperations.ResetLowestSetBit(mask);
+                    }
+                    while (mask != 0);
+                }
+
+                cur = ref Unsafe.Add(ref cur, 16);
+
+                if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVector))
+                {
+                    if (Unsafe.AreSame(ref cur, ref searchSpaceEnd))
+                    {
+                        break;
+                    }
+
+                    // Adjust the current vector and do one last iteration.
+                    cur = ref lastStartVector;
+                }
+            }
+
+            return -1;
+        }
     }
 }
index 8eadbbb37980c4f5b1f1e89227649ab3b2d2cbca..a8853d950e1cd68c15fdb642aa25d752c002f8af 100644 (file)
@@ -6,6 +6,7 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 
 namespace System.Runtime.Intrinsics
@@ -1820,6 +1821,21 @@ namespace System.Runtime.Intrinsics
             return Unsafe.ReadUnaligned<Vector128<T>>(ref Unsafe.As<T, byte>(ref source));
         }
 
+        /// <summary>Loads a vector from the given source and reinterprets it as <see cref="ushort"/>.</summary>
+        /// <param name="source">The source from which the vector will be loaded.</param>
+        /// <returns>The vector loaded from <paramref name="source" />.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<ushort> LoadUnsafe(ref char source) =>
+            LoadUnsafe(ref Unsafe.As<char, ushort>(ref source));
+
+        /// <summary>Loads a vector from the given source and element offset and reinterprets it as <see cref="ushort"/>.</summary>
+        /// <param name="source">The source to which <paramref name="elementOffset" /> will be added before loading the vector.</param>
+        /// <param name="elementOffset">The element offset from <paramref name="source" /> from which the vector will be loaded.</param>
+        /// <returns>The vector loaded from <paramref name="source" /> plus <paramref name="elementOffset" />.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<ushort> LoadUnsafe(ref char source, nuint elementOffset) =>
+            LoadUnsafe(ref Unsafe.As<char, ushort>(ref source), elementOffset);
+
         /// <summary>Computes the maximum of two vectors on a per-element basis.</summary>
         /// <typeparam name="T">The type of the elements in the vector.</typeparam>
         /// <param name="left">The vector to compare with <paramref name="right" />.</param>
@@ -2419,6 +2435,35 @@ namespace System.Runtime.Intrinsics
             return result;
         }
 
+        /// <summary>Creates a new vector by selecting values from an input vector using a set of indices.
+        /// Behavior is platform-dependent for out-of-range indices.</summary>
+        /// <param name="vector">The input vector from which values are selected.</param>
+        /// <param name="indices">The per-element indices used to select a value from <paramref name="vector" />.</param>
+        /// <returns>A new vector containing the values from <paramref name="vector" /> selected by the given <paramref name="indices" />.</returns>
+        /// <remarks>Unlike Shuffle, this method delegates to the underlying hardware intrinsic without ensuring that <paramref name="indices"/> are normalized to [0, 15].
+        /// On hardware with <see cref="Ssse3"/> support, indices are treated as modulo 16, and if the high bit is set, the result will be set to 0 for that element.
+        /// On hardware with <see cref="AdvSimd.Arm64"/> or <see cref="PackedSimd"/> support, this method behaves the same as Shuffle.</remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector128<byte> ShuffleUnsafe(Vector128<byte> vector, Vector128<byte> indices)
+        {
+            if (Ssse3.IsSupported)
+            {
+                return Ssse3.Shuffle(vector, indices);
+            }
+
+            if (AdvSimd.Arm64.IsSupported)
+            {
+                return AdvSimd.Arm64.VectorTableLookup(vector, indices);
+            }
+
+            if (PackedSimd.IsSupported)
+            {
+                return PackedSimd.Swizzle(vector, indices);
+            }
+
+            return Shuffle(vector, indices);
+        }
+
         /// <summary>Creates a new vector by selecting values from an input vector using a set of indices.</summary>
         /// <param name="vector">The input vector from which values are selected.</param>
         /// <param name="indices">The per-element indices used to select a value from <paramref name="vector" />.</param>
index 212133baf28b332c7a71b74ab3d7e670f26dc184..340d20b0812327dd0c19b56a232e7d214fd60eb4 100644 (file)
@@ -1809,6 +1809,21 @@ namespace System.Runtime.Intrinsics
             return Unsafe.ReadUnaligned<Vector256<T>>(ref Unsafe.As<T, byte>(ref source));
         }
 
+        /// <summary>Loads a vector from the given source and reinterprets it as <see cref="ushort"/>.</summary>
+        /// <param name="source">The source from which the vector will be loaded.</param>
+        /// <returns>The vector loaded from <paramref name="source" />.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<ushort> LoadUnsafe(ref char source) =>
+            LoadUnsafe(ref Unsafe.As<char, ushort>(ref source));
+
+        /// <summary>Loads a vector from the given source and element offset and reinterprets it as <see cref="ushort"/>.</summary>
+        /// <param name="source">The source to which <paramref name="elementOffset" /> will be added before loading the vector.</param>
+        /// <param name="elementOffset">The element offset from <paramref name="source" /> from which the vector will be loaded.</param>
+        /// <returns>The vector loaded from <paramref name="source" /> plus <paramref name="elementOffset" />.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<ushort> LoadUnsafe(ref char source, nuint elementOffset) =>
+            LoadUnsafe(ref Unsafe.As<char, ushort>(ref source), elementOffset);
+
         /// <summary>Computes the maximum of two vectors on a per-element basis.</summary>
         /// <typeparam name="T">The type of the elements in the vector.</typeparam>
         /// <param name="left">The vector to compare with <paramref name="right" />.</param>
index e51370c7805383ef96a07654d44186863c077020..f48659545b9d8affd1427fd2ac909eff4988a925 100644 (file)
@@ -68,7 +68,6 @@ namespace System
             // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
             // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
         SEARCH_TWO_CHARS:
-            ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);
             if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
             {
                 // Find the last unique (which is not equal to ch1) character
@@ -89,8 +88,8 @@ namespace System
                     // Make sure we don't go out of bounds
                     Debug.Assert(offset + ch1ch2Distance + Vector256<ushort>.Count <= searchSpaceLength);
 
-                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
-                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
+                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
                     Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                     // Early out: cmpAnd is all zeros
@@ -156,8 +155,8 @@ namespace System
                     // Make sure we don't go out of bounds
                     Debug.Assert(offset + ch1ch2Distance + Vector128<ushort>.Count <= searchSpaceLength);
 
-                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
-                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
+                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
                     Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                     // Early out: cmpAnd is all zeros
@@ -254,7 +253,6 @@ namespace System
             // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
             // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
         SEARCH_TWO_CHARS:
-            ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);
             if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<ushort>.Count)
             {
                 offset = searchSpaceMinusValueTailLength - Vector256<ushort>.Count;
@@ -272,8 +270,8 @@ namespace System
                 do
                 {
 
-                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
-                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
+                    Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                     Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                     // Early out: cmpAnd is all zeros
@@ -321,8 +319,8 @@ namespace System
 
                 do
                 {
-                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
-                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
+                    Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
+                    Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
                     Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
 
                     // Early out: cmpAnd is all zeros
index c5433157967bc6c563d2e64352d323352586b602..589c7d021e3f5dd7bc4b2493e0f395fb8b8c5edf 100644 (file)
@@ -1916,7 +1916,7 @@ namespace System
             nuint offset = 0;
             nuint lengthToExamine = (uint)sourceSpan.Length;
 
-            ref ushort source = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(sourceSpan));
+            ref char source = ref MemoryMarshal.GetReference(sourceSpan);
 
             Vector128<ushort> v1 = Vector128.Create((ushort)c);
             Vector128<ushort> v2 = Vector128.Create((ushort)c2);
@@ -1947,7 +1947,7 @@ namespace System
 
             while (offset < lengthToExamine)
             {
-                char curr = (char)Unsafe.Add(ref source, offset);
+                char curr = Unsafe.Add(ref source, offset);
                 if (curr == c || curr == c2 || curr == c3)
                 {
                     sepListBuilder.Append((int)offset);