// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Diagnostics;
+using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
#pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228
internal readonly struct ProbabilisticMap
- private const int IndexMask = 0x7;
- private const int IndexShift = 0x3;
+ // The vectorized algorithm operates on bytes instead of uint32s.
+ // The index and shift are adjusted so that we represent the structure
+ // as "32 x uint8" instead of "8 x uint32".
+ // We use the vectorized implementation when we have access to Sse41 or Arm64 intrinsics.
+ private const uint VectorizedIndexMask = 31u;
+ private const int VectorizedIndexShift = 5;
+ // If we don't support vectorization, use uint32 to speed up
+ // "IsCharBitSet" checks in scalar loops.
+ private const uint PortableIndexMask = 7u;
+ private const int PortableIndexShift = 3;
private readonly uint _e0, _e1, _e2, _e3, _e4, _e5, _e6, _e7;
if (hasAscii)
// Common to search for ASCII symbols. Just set the high value once.
- charMap |= 1u;
+ SetCharBit(ref charMap, 0);
- private static void SetCharBit(ref uint charMap, byte value) =>
- Unsafe.Add(ref charMap, (uint)value & IndexMask) |= 1u << (value >> IndexShift);
+ /// <summary>Records <paramref name="value"/> in the bitmap that begins at <paramref name="charMap"/>.</summary>
+ /// <remarks>The map is addressed as bytes when Sse41 or AdvSimd.Arm64 is available and as uint32 otherwise.</remarks>
+ private static void SetCharBit(ref uint charMap, byte value)
+ {
+     if (!Sse41.IsSupported && !AdvSimd.Arm64.IsSupported)
+     {
+         // Portable layout: the low 3 bits of the value pick the uint32 slot, the remaining bits pick the bit inside it.
+         ref uint slot = ref Unsafe.Add(ref charMap, value & PortableIndexMask);
+         slot |= 1u << (value >> PortableIndexShift);
+         return;
+     }
+     // Vectorized layout: the low 5 bits of the value pick the byte, the remaining 3 bits pick the bit inside it.
+     ref byte bucket = ref Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask);
+     bucket |= (byte)(1u << (value >> VectorizedIndexShift));
+ }
- private static bool IsCharBitSet(ref uint charMap, byte value) =>
- (Unsafe.Add(ref charMap, (uint)value & IndexMask) & (1u << (value >> IndexShift))) != 0;
+ /// <summary>Reports whether the bit for <paramref name="value"/> is set in the bitmap that begins at <paramref name="charMap"/>.</summary>
+ /// <remarks>The map is addressed as bytes when Sse41 or AdvSimd.Arm64 is available and as uint32 otherwise.</remarks>
+ private static bool IsCharBitSet(ref uint charMap, byte value)
+ {
+     if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported)
+     {
+         // Vectorized layout: 32 byte-sized slots, 3 bits of the value select the bit.
+         byte bucket = Unsafe.Add(ref Unsafe.As<uint, byte>(ref charMap), value & VectorizedIndexMask);
+         return (bucket & (1u << (value >> VectorizedIndexShift))) != 0;
+     }
+     // Portable layout: 8 uint32-sized slots, 5 bits of the value select the bit.
+     uint slot = Unsafe.Add(ref charMap, value & PortableIndexMask);
+     return (slot & (1u << (value >> PortableIndexShift))) != 0;
+ }
/// <summary>Checks whether <paramref name="ch"/> is one of <paramref name="values"/>, using <paramref name="charMap"/> as a pre-filter.</summary>
/// <remarks>The low byte and the high byte of <paramref name="ch"/> must both be flagged in the map before the
/// exact search over <paramref name="values"/> is performed; the conditions short-circuit in that order.</remarks>
internal static bool Contains(ref uint charMap, ReadOnlySpan<char> values, int ch) =>
IsCharBitSet(ref charMap, (byte)ch) &&
IsCharBitSet(ref charMap, (byte)(ch >> 8)) &&
- values.Contains((char)ch);
+ Contains(values, (char)ch);
+ /// <summary>Exact membership test: is <paramref name="ch"/> present anywhere in <paramref name="values"/>?</summary>
+ /// <remarks>The chars are reinterpreted as <see cref="short"/> and handed to <c>SpanHelpers.NonPackedContainsValueType</c>.</remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static bool Contains(ReadOnlySpan<char> values, char ch)
+ {
+     ref short firstValue = ref Unsafe.As<char, short>(ref MemoryMarshal.GetReference(values));
+     return SpanHelpers.NonPackedContainsValueType(ref firstValue, (short)ch, values.Length);
+ }
+ /// <summary>Tests the 32 chars starting at <paramref name="searchSpace"/> against the byte-wise bitmap.</summary>
+ /// <returns>A vector with one byte per char: all bits set when both the low and the high byte of that char are flagged
+ /// in the map, zero otherwise. The bytes are in the order produced by <c>Avx2.PackUnsignedSaturate</c>.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector256<byte> ContainsMask32CharsAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, ref char searchSpace)
+ {
+     Vector256<ushort> lowByteMask = Vector256.Create((ushort)255);
+     Vector256<ushort> first16 = Vector256.LoadUnsafe(ref searchSpace);
+     Vector256<ushort> second16 = Vector256.LoadUnsafe(ref searchSpace, (nuint)Vector256<ushort>.Count);
+     // After masking / shifting every 16-bit lane is within [0, 255], so the saturating pack never clamps.
+     Vector256<byte> lowBytes = Avx2.PackUnsignedSaturate(
+         (first16 & lowByteMask).AsInt16(),
+         (second16 & lowByteMask).AsInt16());
+     Vector256<byte> highBytes = Avx2.PackUnsignedSaturate(
+         (first16 >>> 8).AsInt16(),
+         (second16 >>> 8).AsInt16());
+     // A char is a candidate only when both of its bytes are present in the map.
+     Vector256<byte> lowHits = IsCharBitSetAvx2(charMapLower, charMapUpper, lowBytes);
+     Vector256<byte> highHits = IsCharBitSetAvx2(charMapLower, charMapUpper, highBytes);
+     return lowHits & highHits;
+ }
+ /// <summary>Looks up every byte of <paramref name="values"/> in the 32-byte bitmap held in
+ /// <paramref name="charMapLower"/> (bytes 0-15) and <paramref name="charMapUpper"/> (bytes 16-31).</summary>
+ /// <returns>Per byte: all bits set when the matching bit is set in the map, zero otherwise.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector256<byte> IsCharBitSetAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, Vector256<byte> values)
+ {
+     // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
+     // Shift as int32 and mask with 15 so the index stays inside the 16-byte table used below.
+     Vector256<byte> shifted = (values.AsInt32() >>> VectorizedIndexShift).AsByte();
+     Vector256<byte> bitIndex = shifted & Vector256.Create((byte)15);
+     // The table holds the single-bit masks 0x01..0x80 and repeats every 8 bytes, so indices 8-15 map like 0-7.
+     Vector256<byte> singleBit = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), bitIndex);
+     Vector256<byte> slot = values & Vector256.Create((byte)VectorizedIndexMask);
+     // Probe both halves of the map, then keep the half that the slot number actually points into.
+     Vector256<byte> fromLower = Avx2.Shuffle(charMapLower, slot);
+     Vector256<byte> fromUpper = Avx2.Shuffle(charMapUpper, slot - Vector256.Create((byte)16));
+     Vector256<byte> useUpper = Vector256.GreaterThan(slot, Vector256.Create((byte)15));
+     Vector256<byte> mapByte = Vector256.ConditionalSelect(useUpper, fromUpper, fromLower);
+     return ~Vector256.Equals(mapByte & singleBit, Vector256<byte>.Zero);
+ }
+ /// <summary>Tests the 16 chars starting at <paramref name="searchSpace"/> against the byte-wise bitmap.</summary>
+ /// <returns>A vector with one byte per char: all bits set when both the low and the high byte of that char are flagged
+ /// in the map, zero otherwise.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, ref char searchSpace)
+ {
+     Vector128<ushort> first8 = Vector128.LoadUnsafe(ref searchSpace);
+     Vector128<ushort> second8 = Vector128.LoadUnsafe(ref searchSpace, (nuint)Vector128<ushort>.Count);
+     Vector128<byte> lowBytes;
+     Vector128<byte> highBytes;
+     if (Sse2.IsSupported)
+     {
+         // After masking / shifting every 16-bit lane is within [0, 255], so the saturating pack never clamps.
+         Vector128<ushort> lowByteMask = Vector128.Create((ushort)255);
+         lowBytes = Sse2.PackUnsignedSaturate((first8 & lowByteMask).AsInt16(), (second8 & lowByteMask).AsInt16());
+         highBytes = Sse2.PackUnsignedSaturate((first8 >>> 8).AsInt16(), (second8 >>> 8).AsInt16());
+     }
+     else
+     {
+         // Split the raw bytes of the chars into the even-positioned and the odd-positioned ones.
+         lowBytes = AdvSimd.Arm64.UnzipEven(first8.AsByte(), second8.AsByte());
+         highBytes = AdvSimd.Arm64.UnzipOdd(first8.AsByte(), second8.AsByte());
+     }
+     // A char is a candidate only when both of its bytes are present in the map.
+     Vector128<byte> lowHits = IsCharBitSet(charMapLower, charMapUpper, lowBytes);
+     Vector128<byte> highHits = IsCharBitSet(charMapLower, charMapUpper, highBytes);
+     return lowHits & highHits;
+ }
+ /// <summary>Looks up every byte of <paramref name="values"/> in the 32-byte bitmap held in
+ /// <paramref name="charMapLower"/> (bytes 0-15) and <paramref name="charMapUpper"/> (bytes 16-31).</summary>
+ /// <returns>Per byte: all bits set when the matching bit is set in the map, zero otherwise.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<byte> IsCharBitSet(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, Vector128<byte> values)
+ {
+     Vector128<byte> bitIndex;
+     if (Sse2.IsSupported)
+     {
+         // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
+         // Shift as int32 and mask with 15 so the index stays inside the 16-byte table used below.
+         bitIndex = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15);
+     }
+     else
+     {
+         bitIndex = values >>> VectorizedIndexShift;
+     }
+     // The table holds the single-bit masks 0x01..0x80 and repeats every 8 bytes, so indices 8-15 map like 0-7.
+     Vector128<byte> singleBit = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), bitIndex);
+     Vector128<byte> slot = values & Vector128.Create((byte)VectorizedIndexMask);
+     // Probe both halves of the map, then keep the half that the slot number actually points into.
+     Vector128<byte> fromLower = Vector128.ShuffleUnsafe(charMapLower, slot);
+     Vector128<byte> fromUpper = Vector128.ShuffleUnsafe(charMapUpper, slot - Vector128.Create((byte)16));
+     Vector128<byte> useUpper = Vector128.GreaterThan(slot, Vector128.Create((byte)15));
+     Vector128<byte> mapByte = Vector128.ConditionalSelect(useUpper, fromUpper, fromLower);
+     return ~Vector128.Equals(mapByte & singleBit, Vector128<byte>.Zero);
+ }
private static bool ShouldUseSimpleLoop(int searchSpaceLength, int valuesLength)
while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd))
char c = cur;
- if (TNegator.NegateIfNeeded(valuesSpan.Contains(c)))
+ if (TNegator.NegateIfNeeded(Contains(valuesSpan, c)))
return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
for (int i = searchSpaceLength - 1; i >= 0; i--)
char c = Unsafe.Add(ref searchSpace, i);
- if (TNegator.NegateIfNeeded(valuesSpan.Contains(c)))
+ if (TNegator.NegateIfNeeded(Contains(valuesSpan, c)))
return i;
internal static int IndexOfAny<TNegator>(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan<char> values)
where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
+ if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= 16)
+ {
+ return IndexOfAnyVectorized(ref charMap, ref searchSpace, searchSpaceLength, values);
+ }
ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
ref char cur = ref searchSpace;
int ch = cur;
if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch)))
- return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
+ return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char));
cur = ref Unsafe.Add(ref cur, 1);
return -1;
/// <summary>Vectorized search for the first char of the search space that is contained in <paramref name="values"/>.</summary>
/// <remarks>The bitmap at <paramref name="charMap"/> acts as a pre-filter over 16 (or, with Avx2, 32) chars at a time;
/// every position it flags is confirmed with an exact lookup in <paramref name="values"/> before it is returned.</remarks>
/// <returns>The index of the match relative to <paramref name="searchSpace"/>, or -1 when no char matches.</returns>
+ private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan<char> values)
+ {
+ Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported);
+ Debug.Assert(searchSpaceLength >= 16);
+ ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength);
+ ref char cur = ref searchSpace;
// Load the map as two 16-byte halves: the first Vector128<byte>.Count bytes and the ones right after them.
+ Vector128<byte> charMapLower = Vector128.LoadUnsafe(ref Unsafe.As<uint, byte>(ref charMap));
+ Vector128<byte> charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As<uint, byte>(ref charMap), (nuint)Vector128<byte>.Count);
+ if (Avx2.IsSupported && searchSpaceLength >= 32)
+ {
// Put the same map half into both 128-bit halves of each 256-bit vector.
+ Vector256<byte> charMapLower256 = Vector256.Create(charMapLower, charMapLower);
+ Vector256<byte> charMapUpper256 = Vector256.Create(charMapUpper, charMapUpper);
// Last position from which a full 32-char vector can still be read.
+ ref char lastStartVectorAvx2 = ref Unsafe.Subtract(ref searchSpaceEnd, 32);
+ while (true)
+ {
+ Vector256<byte> result = ContainsMask32CharsAvx2(charMapLower256, charMapUpper256, ref cur);
+ if (result != Vector256<byte>.Zero)
+ {
+ // Account for how ContainsMask32CharsAvx2 packed the source chars (Avx2.PackUnsignedSaturate).
+ result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte();
// Walk the flagged positions from the lowest set bit upwards and confirm each one exactly.
+ uint mask = result.ExtractMostSignificantBits();
+ do
+ {
+ ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask));
+ if (Contains(values, candidatePos))
+ {
+ return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char));
+ }
+ mask = BitOperations.ResetLowestSetBit(mask);
+ }
+ while (mask != 0);
+ }
+ cur = ref Unsafe.Add(ref cur, 32);
// Fewer than 32 chars remain after 'cur': finish, or set up one final (possibly overlapping) vector.
+ if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVectorAvx2))
+ {
+ if (Unsafe.AreSame(ref cur, ref searchSpaceEnd))
+ {
+ return -1;
+ }
+ if (Unsafe.ByteOffset(ref cur, ref searchSpaceEnd) > 16 * sizeof(char))
+ {
+ // If we have more than 16 characters left to process, we can
+ // adjust the current vector and do one last iteration of Avx2.
+ cur = ref lastStartVectorAvx2;
+ }
+ else
+ {
+ // Otherwise adjust the vector such that we'll only need to do a single
+ // iteration of ContainsMask16Chars below.
+ cur = ref Unsafe.Subtract(ref searchSpaceEnd, 16);
+ break;
+ }
+ }
+ }
+ }
// Last position from which a full 16-char vector can still be read.
+ ref char lastStartVector = ref Unsafe.Subtract(ref searchSpaceEnd, 16);
+ while (true)
+ {
+ Vector128<byte> result = ContainsMask16Chars(charMapLower, charMapUpper, ref cur);
+ if (result != Vector128<byte>.Zero)
+ {
// Walk the flagged positions from the lowest set bit upwards and confirm each one exactly.
+ uint mask = result.ExtractMostSignificantBits();
+ do
+ {
+ ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask));
+ if (Contains(values, candidatePos))
+ {
+ return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char));
+ }
+ mask = BitOperations.ResetLowestSetBit(mask);
+ }
+ while (mask != 0);
+ }
+ cur = ref Unsafe.Add(ref cur, 16);
// Fewer than 16 chars remain after 'cur': stop at the exact end, otherwise re-run on the last full vector.
+ if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVector))
+ {
+ if (Unsafe.AreSame(ref cur, ref searchSpaceEnd))
+ {
+ break;
+ }
+ // Adjust the current vector and do one last iteration.
+ cur = ref lastStartVector;
+ }
+ }
+ return -1;
+ }
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
using System.Runtime.Intrinsics.X86;
namespace System.Runtime.Intrinsics
return Unsafe.ReadUnaligned<Vector128<T>>(ref Unsafe.As<T, byte>(ref source));
+ /// <summary>Reads a vector of <see cref="ushort"/> starting at the given <see cref="char"/> reference.</summary>
+ /// <param name="source">Reference to the first char to load.</param>
+ /// <returns>The vector read from <paramref name="source" />, with every char reinterpreted as a <see cref="ushort"/>.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static Vector128<ushort> LoadUnsafe(ref char source)
+ {
+     ref ushort reinterpreted = ref Unsafe.As<char, ushort>(ref source);
+     return LoadUnsafe(ref reinterpreted);
+ }
+ /// <summary>Reads a vector of <see cref="ushort"/> from a <see cref="char"/> reference advanced by an element offset.</summary>
+ /// <param name="source">Reference that <paramref name="elementOffset" /> is applied to before the load.</param>
+ /// <param name="elementOffset">Number of elements to skip past <paramref name="source" /> before loading.</param>
+ /// <returns>The vector read from <paramref name="source" /> plus <paramref name="elementOffset" />, with every char reinterpreted as a <see cref="ushort"/>.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static Vector128<ushort> LoadUnsafe(ref char source, nuint elementOffset)
+ {
+     ref ushort reinterpreted = ref Unsafe.As<char, ushort>(ref source);
+     return LoadUnsafe(ref reinterpreted, elementOffset);
+ }
/// <summary>Computes the maximum of two vectors on a per-element basis.</summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="left">The vector to compare with <paramref name="right" />.</param>
return result;
+ /// <summary>Builds a vector by picking bytes of <paramref name="vector" /> according to <paramref name="indices" />.
+ /// What happens for out-of-range indices depends on the platform.</summary>
+ /// <param name="vector">The vector whose bytes are selected.</param>
+ /// <param name="indices">For each result element, the index of the byte to take from <paramref name="vector" />.</param>
+ /// <returns>A vector made of the bytes of <paramref name="vector" /> chosen by <paramref name="indices" />.</returns>
+ /// <remarks>In contrast to Shuffle, the call is forwarded straight to the hardware intrinsic and <paramref name="indices"/> are not normalized to [0, 15] first.
+ /// With <see cref="Ssse3"/>, indices are taken modulo 16 and an element whose index has the high bit set becomes 0.
+ /// With <see cref="AdvSimd.Arm64"/> or <see cref="PackedSimd"/>, the result is the same as that of Shuffle.</remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static Vector128<byte> ShuffleUnsafe(Vector128<byte> vector, Vector128<byte> indices)
+ {
+     // Prefer the native byte shuffle of the current platform; fall back to the normalizing Shuffle otherwise.
+     return Ssse3.IsSupported ? Ssse3.Shuffle(vector, indices)
+         : AdvSimd.Arm64.IsSupported ? AdvSimd.Arm64.VectorTableLookup(vector, indices)
+         : PackedSimd.IsSupported ? PackedSimd.Swizzle(vector, indices)
+         : Shuffle(vector, indices);
+ }
/// <summary>Creates a new vector by selecting values from an input vector using a set of indices.</summary>
/// <param name="vector">The input vector from which values are selected.</param>
/// <param name="indices">The per-element indices used to select a value from <paramref name="vector" />.</param>
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
- ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);
if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
// Find the last unique (which is not equal to ch1) character
// Make sure we don't go out of bounds
Debug.Assert(offset + ch1ch2Distance + Vector256<ushort>.Count <= searchSpaceLength);
- Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
- Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
+ Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
// Make sure we don't go out of bounds
Debug.Assert(offset + ch1ch2Distance + Vector128<ushort>.Count <= searchSpaceLength);
- Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
- Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
+ Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
- ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);
if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<ushort>.Count)
offset = searchSpaceMinusValueTailLength - Vector256<ushort>.Count;
- Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
- Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector256<ushort> cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset));
+ Vector256<ushort> cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector256<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros
- Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset));
- Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector128<ushort> cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
+ Vector128<ushort> cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
Vector128<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
// Early out: cmpAnd is all zeros